Adding remaining conv, dynamic_op, and scaleadd_scaleadd_relu flavors for grouped conv fwd (#3529)

* Adding remaining flavors for grouped conv fwd

As titled. The following variants are added (a short sketch of the scale-family semantics follows the list):
- grouped_conv2d_fwd_dynamic_op
- grouped_conv3d_fwd_dynamic_op
- grouped_conv3d_fwd_bilinear
- grouped_conv3d_fwd_convscale
- grouped_conv3d_fwd_convinvscale
- grouped_conv3d_fwd_convscale_add
- grouped_conv3d_fwd_convscale_relu
- grouped_conv3d_fwd_scale
- grouped_conv3d_fwd_combconvscale
- grouped_conv3d_fwd_scaleadd_scaleadd_relu
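For orientation, a minimal sketch of what the scale-family epilogues compute (illustrative only; the real CK element_wise ops have their own member names and type_convert plumbing):

    // Hedged sketch: the convscale-style ops rescale the raw convolution
    // accumulator (AccDataType) before the final cast to the output type.
    struct ConvScaleSketch
    {
        float scale_in;  // dequantization scale of the input tensor
        float scale_wei; // dequantization scale of the weight tensor
        float scale_out; // quantization scale of the output tensor

        template <typename E, typename C>
        void operator()(E& e, const C& c) const
        {
            e = static_cast<E>(c * scale_in * scale_wei * scale_out);
        }
    };
    // convinvscale presumably divides by the scales instead; convscale_add
    // consumes an extra D tensor; convscale_relu clamps negatives to zero.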

* Fix incomplete parsing of data types from source file names in the add_instance_library() CMakeLists function so we don't build f8 instances on RDNA3 (a hedged sketch of the intent follows).
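The shape of that fix, as a hypothetical CMake sketch (variable names are illustrative, not the actual add_instance_library() internals):

    # Hypothetical sketch: drop f8/bf8 instance sources when the requested
    # targets cannot compile them (e.g. RDNA3 / gfx11).
    if(GPU_TARGETS MATCHES "gfx11")
        foreach(source IN LISTS INSTANCE_SOURCES)
            if(source MATCHES "(_f8|_bf8)")
                list(REMOVE_ITEM INSTANCE_SOURCES "${source}")
            endif()
        endforeach()
    endif()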

* Do not build f8/bf8-only flavor tests on RDNA3.

* Make sure we have proper generic instances (ScalarPerVector = 1) for all instance lists related to the post-CES extra flavors, then disable all but one generic instance per instance list to reduce compile time; the note below illustrates what "generic" means here.
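Illustrative only, using the parameter names of the device template that appears throughout this PR: a generic instance is one whose vector access widths are all 1, so it stays applicable to tensors of any extent/stride alignment, at the cost of scalar memory accesses:

    // ...
    1, // ABlockTransferSrcScalarPerVector
    // ...
    1, // BBlockTransferSrcScalarPerVector
    // ...
    1, // CDEBlockTransferScalarPerVector_NPerBlock
    // ...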

* Post-rebase fix: template parameters for the Grouped Conv Fwd device impl were tweaked upstream.

* Adding int8 and fp16 overloads to the elementwise operations (a hedged sketch of the overload pattern follows).
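A hedged sketch of the overload pattern, not the exact CK code:

    // Each affected elementwise op gains operator() overloads for the new
    // types, computing in float and converting back on the way out.
    struct ScaleSketch
    {
        float scale_;

        void operator()(ck::half_t& y, const ck::half_t& x) const
        {
            y = ck::type_convert<ck::half_t>(scale_ * ck::type_convert<float>(x));
        }
        void operator()(int8_t& y, const int8_t& x) const
        {
            y = ck::type_convert<int8_t>(scale_ * ck::type_convert<float>(x));
        }
    };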

* Fixed Copilot nits.

* Addressing review comments:

- removed unnecessary examples for dynamic op
- removed unnecessary conv specializations for all the flavors
- removed spurious bilinear and scale source files

* clang-format

* Reduced the number of tests.

---------

Co-authored-by: Wojciech Laskowski <wojciech.laskowski@streamhpc.com>
Author: Kiefer van Teutem
Date: 2026-01-30 17:02:14 +01:00 (committed by GitHub)
Parent: 6a6177a246
Commit: 2377a62837
72 changed files with 5178 additions and 34 deletions

View File

@@ -5,4 +5,11 @@ if (NOT GPU_TARGETS MATCHES "gfx11")
add_custom_target(example_convnd_activ_xdl_convinvscale)
add_example_executable(example_convnd_fwd_xdl_convinvscale_fp8 convnd_fwd_xdl_convinvscale_fp8.cpp)
add_example_dependencies(example_convnd_activ_xdl_convinvscale example_convnd_fwd_xdl_convinvscale_fp8)
endif()
endif()
# WMMA
if (GPU_TARGETS MATCHES "gfx12")
add_custom_target(example_convnd_activ_wmma_convinvscale)
add_example_executable(example_convnd_fwd_wmma_convinvscale_fp8 convnd_fwd_wmma_convinvscale_fp8.cpp)
add_example_dependencies(example_convnd_activ_wmma_convinvscale example_convnd_fwd_wmma_convinvscale_fp8)
endif()

View File

@@ -0,0 +1,98 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include "convnd_fwd_convinvscale_common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_wmma_cshuffle_v3.hpp"
using InDataType = ck::f8_t;
using WeiDataType = ck::f8_t;
using AccDataType = float;
using CShuffleDataType = float;
using DsDataType = ck::Tuple<>;
using OutDataType = ck::f8_t;
using AComputeDataType = ck::f8_t;
using BComputeDataType = ck::f8_t;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = ConvInvscale;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename DsLayout,
typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Wmma_CShuffle_V3<
NDimSpatial, // NDimSpatial
InLayout, // ALayout
WeiLayout, // BLayout
DsLayout, // DsLayout (empty tuple for ConvInvScale)
OutLayout, // ELayout
InDataType, // ADataType
WeiDataType, // BDataType
AccDataType, // AccDataType
CShuffleDataType, // CShuffleDataType
DsDataType, // DsDataType (empty tuple)
OutDataType, // EDataType
InElementOp, // AElementwiseOperation
WeiElementOp, // BElementwiseOperation
OutElementOp, // CDEElementwiseOperation
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
64, // BlockSize
64, // MPerBlock
64, // NPerBlock
32, // KPerBlock
8, // AK1
8, // BK1
16, // MPerWmma
16, // NPerWmma
4, // MRepeat
2, // NRepeat
S<4, 16, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
1, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 16, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
1, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1, // CShuffleMRepeatPerShuffle
1, // CShuffleNRepeatPerShuffle
S<1, 16, 1, 4>, // CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
1, // CDEBlockTransferScalarPerVector_NPerBlock
ck::BlockGemmPipelineScheduler::Intrawave, // BlkGemmPipeSched
ck::BlockGemmPipelineVersion::v1, // BlkGemmPipelineVer
true, // UseThreadTileTransfer
AComputeDataType, // AComputeDataType
BComputeDataType, // BComputeDataType
1>; // NumGroupsToMerge
#include "run_convnd_fwd_convinvscale_example.inc"
int main(int argc, char* argv[])
{
if(!ck::is_gfx12_supported())
{
std::cout << "This kernel support gfx12 only" << std::endl;
return 0;
}
return run_convnd_fwd_example(argc, argv) ? 0 : 1;
}
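For reference, the shared run_*_example.inc drivers conventionally parse three positional arguments (an assumption based on CK's example conventions; the drivers print their own usage message):

    // argv[1] = do_verification (0|1)
    // argv[2] = init_method (0: none, 1: integer values, 2: decimal values)
    // argv[3] = time_kernel (0|1)
    // e.g.: ./example_convnd_fwd_wmma_convinvscale_fp8 1 2 0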

View File

@@ -15,3 +15,19 @@ if (NOT GPU_TARGETS MATCHES "gfx11")
add_example_executable(example_convnd_fwd_xdl_convscale_bf8_fp8 convnd_fwd_xdl_convscale_bf8_fp8.cpp)
add_example_dependencies(example_convnd_activ_xdl_convscale example_convnd_fwd_xdl_convscale_bf8_fp8)
endif()
# WMMA
if (GPU_TARGETS MATCHES "gfx12")
add_custom_target(example_convnd_activ_wmma_convscale)
add_example_executable(example_convnd_fwd_wmma_convscale_fp8 convnd_fwd_wmma_convscale_fp8.cpp)
add_example_dependencies(example_convnd_activ_wmma_convscale example_convnd_fwd_wmma_convscale_fp8)
add_example_executable(example_convnd_fwd_wmma_convscale_bf8 convnd_fwd_wmma_convscale_bf8.cpp)
add_example_dependencies(example_convnd_activ_wmma_convscale example_convnd_fwd_wmma_convscale_bf8)
add_example_executable(example_convnd_fwd_wmma_convscale_fp8_bf8 convnd_fwd_wmma_convscale_fp8_bf8.cpp)
add_example_dependencies(example_convnd_activ_wmma_convscale example_convnd_fwd_wmma_convscale_fp8_bf8)
add_example_executable(example_convnd_fwd_wmma_convscale_bf8_fp8 convnd_fwd_wmma_convscale_bf8_fp8.cpp)
add_example_dependencies(example_convnd_activ_wmma_convscale example_convnd_fwd_wmma_convscale_bf8_fp8)
endif()

View File

@@ -0,0 +1,98 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include "convnd_fwd_convscale_common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_wmma_cshuffle_v3.hpp"
using InDataType = ck::bf8_t;
using WeiDataType = ck::bf8_t;
using AccDataType = float;
using CShuffleDataType = float;
using DsDataType = ck::Tuple<>;
using OutDataType = ck::f8_t;
using AComputeDataType = InDataType;
using BComputeDataType = AComputeDataType;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = ConvScale;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename DsLayout,
typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Wmma_CShuffle_V3<
NDimSpatial, // NDimSpatial
InLayout, // ALayout
WeiLayout, // BLayout
DsLayout, // DsLayout (empty tuple for ConvScale)
OutLayout, // ELayout
InDataType, // ADataType
WeiDataType, // BDataType
AccDataType, // AccDataType
CShuffleDataType, // CShuffleDataType
DsDataType, // DsDataType (empty tuple)
OutDataType, // EDataType
InElementOp, // AElementwiseOperation
WeiElementOp, // BElementwiseOperation
OutElementOp, // CDEElementwiseOperation
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
64, // BlockSize
64, // MPerBlock
64, // NPerBlock
32, // KPerBlock
8, // AK1
8, // BK1
16, // MPerWmma
16, // NPerWmma
4, // MRepeat
2, // NRepeat
S<4, 16, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
1, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 16, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
1, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1, // CShuffleMRepeatPerShuffle
1, // CShuffleNRepeatPerShuffle
S<1, 16, 1, 4>, // CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
1, // CDEBlockTransferScalarPerVector_NPerBlock
ck::BlockGemmPipelineScheduler::Intrawave, // BlkGemmPipeSched
ck::BlockGemmPipelineVersion::v1, // BlkGemmPipelineVer
true, // UseThreadTileTransfer
AComputeDataType, // AComputeDataType
BComputeDataType, // BComputeDataType
1>; // NumGroupsToMerge
#include "run_convnd_fwd_convscale_example.inc"
int main(int argc, char* argv[])
{
if(!ck::is_gfx12_supported())
{
std::cout << "This kernel support gfx12 only" << std::endl;
return 0;
}
return run_convnd_fwd_example(argc, argv) ? 0 : 1;
}

View File

@@ -0,0 +1,98 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include "convnd_fwd_convscale_common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_wmma_cshuffle_v3.hpp"
using InDataType = ck::bf8_t;
using WeiDataType = ck::f8_t;
using AccDataType = float;
using CShuffleDataType = float;
using DsDataType = ck::Tuple<>;
using OutDataType = ck::f8_t;
using AComputeDataType = ck::bf8_t;
using BComputeDataType = ck::f8_t;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = ConvScale;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename DsLayout,
typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Wmma_CShuffle_V3<
NDimSpatial, // NDimSpatial
InLayout, // ALayout
WeiLayout, // BLayout
DsLayout, // DsLayout (empty tuple for ConvScale)
OutLayout, // ELayout
InDataType, // ADataType
WeiDataType, // BDataType
AccDataType, // AccDataType
CShuffleDataType, // CShuffleDataType
DsDataType, // DsDataType (empty tuple)
OutDataType, // EDataType
InElementOp, // AElementwiseOperation
WeiElementOp, // BElementwiseOperation
OutElementOp, // CDEElementwiseOperation
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
64, // BlockSize
64, // MPerBlock
64, // NPerBlock
32, // KPerBlock
8, // AK1
8, // BK1
16, // MPerWmma
16, // NPerWmma
4, // MRepeat
2, // NRepeat
S<4, 16, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
1, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 16, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
1, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1, // CShuffleMRepeatPerShuffle
1, // CShuffleNRepeatPerShuffle
S<1, 16, 1, 4>, // CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
1, // CDEBlockTransferScalarPerVector_NPerBlock
ck::BlockGemmPipelineScheduler::Intrawave, // BlkGemmPipeSched
ck::BlockGemmPipelineVersion::v1, // BlkGemmPipelineVer
true, // UseThreadTileTransfer
AComputeDataType, // AComputeDataType
BComputeDataType, // BComputeDataType
1>; // NumGroupsToMerge
#include "run_convnd_fwd_convscale_example.inc"
int main(int argc, char* argv[])
{
if(!ck::is_gfx12_supported())
{
std::cout << "This kernel support gfx12 only" << std::endl;
return 0;
}
return run_convnd_fwd_example(argc, argv) ? 0 : 1;
}

View File

@@ -0,0 +1,98 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include "convnd_fwd_convscale_common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_wmma_cshuffle_v3.hpp"
using InDataType = ck::f8_t;
using WeiDataType = ck::f8_t;
using AccDataType = float;
using CShuffleDataType = float;
using DsDataType = ck::Tuple<>;
using OutDataType = ck::f8_t;
using AComputeDataType = ck::f8_t;
using BComputeDataType = ck::f8_t;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = ConvScale;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename DsLayout,
typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Wmma_CShuffle_V3<
NDimSpatial, // NDimSpatial
InLayout, // ALayout
WeiLayout, // BLayout
DsLayout, // DsLayout (empty tuple for ConvScale)
OutLayout, // ELayout
InDataType, // ADataType
WeiDataType, // BDataType
AccDataType, // AccDataType
CShuffleDataType, // CShuffleDataType
DsDataType, // DsDataType (empty tuple)
OutDataType, // EDataType
InElementOp, // AElementwiseOperation
WeiElementOp, // BElementwiseOperation
OutElementOp, // CDEElementwiseOperation
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
64, // BlockSize
64, // MPerBlock
64, // NPerBlock
32, // KPerBlock
8, // AK1
8, // BK1
16, // MPerWmma
16, // NPerWmma
4, // MRepeat
2, // NRepeat
S<4, 16, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
1, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 16, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
1, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1, // CShuffleMRepeatPerShuffle
1, // CShuffleNRepeatPerShuffle
S<1, 16, 1, 4>, // CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
1, // CDEBlockTransferScalarPerVector_NPerBlock
ck::BlockGemmPipelineScheduler::Intrawave, // BlkGemmPipeSched
ck::BlockGemmPipelineVersion::v1, // BlkGemmPipelineVer
true, // UseThreadTileTransfer
AComputeDataType, // AComputeDataType
BComputeDataType, // BComputeDataType
1>; // NumGroupsToMerge
#include "run_convnd_fwd_convscale_example.inc"
int main(int argc, char* argv[])
{
if(!ck::is_gfx12_supported())
{
std::cout << "This kernel support gfx12 only" << std::endl;
return 0;
}
return run_convnd_fwd_example(argc, argv) ? 0 : 1;
}

View File

@@ -0,0 +1,98 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include "convnd_fwd_convscale_common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_wmma_cshuffle_v3.hpp"
using InDataType = ck::f8_t;
using WeiDataType = ck::bf8_t;
using AccDataType = float;
using CShuffleDataType = float;
using DsDataType = ck::Tuple<>;
using OutDataType = ck::f8_t;
using AComputeDataType = ck::f8_t;
using BComputeDataType = ck::bf8_t;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = ConvScale;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename DsLayout,
typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Wmma_CShuffle_V3<
NDimSpatial, // NDimSpatial
InLayout, // ALayout
WeiLayout, // BLayout
DsLayout, // DsLayout (empty tuple for ConvScale)
OutLayout, // ELayout
InDataType, // ADataType
WeiDataType, // BDataType
AccDataType, // AccDataType
CShuffleDataType, // CShuffleDataType
DsDataType, // DsDataType (empty tuple)
OutDataType, // EDataType
InElementOp, // AElementwiseOperation
WeiElementOp, // BElementwiseOperation
OutElementOp, // CDEElementwiseOperation
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
64, // BlockSize
64, // MPerBlock
64, // NPerBlock
32, // KPerBlock
8, // AK1
8, // BK1
16, // MPerWmma
16, // NPerWmma
4, // MRepeat
2, // NRepeat
S<4, 16, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
1, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 16, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
1, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1, // CShuffleMRepeatPerShuffle
1, // CShuffleNRepeatPerShuffle
S<1, 16, 1, 4>, // CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
1, // CDEBlockTransferScalarPerVector_NPerBlock
ck::BlockGemmPipelineScheduler::Intrawave, // BlkGemmPipeSched
ck::BlockGemmPipelineVersion::v1, // BlkGemmPipelineVer
true, // UseThreadTileTransfer
AComputeDataType, // AComputeDataType
BComputeDataType, // BComputeDataType
1>; // NumGroupsToMerge
#include "run_convnd_fwd_convscale_example.inc"
int main(int argc, char* argv[])
{
if(!ck::is_gfx12_supported())
{
std::cout << "This kernel support gfx12 only" << std::endl;
return 0;
}
return run_convnd_fwd_example(argc, argv) ? 0 : 1;
}

View File

@@ -5,4 +5,11 @@ if (NOT GPU_TARGETS MATCHES "gfx11")
add_custom_target(example_convnd_activ_xdl_convscale_add)
add_example_executable(example_convnd_fwd_xdl_convscale_add_fp8 convnd_fwd_xdl_convscale_add_fp8.cpp)
add_example_dependencies(example_convnd_activ_xdl_convscale_add example_convnd_fwd_xdl_convscale_add_fp8)
endif()
endif()
# WMMA
if (GPU_TARGETS MATCHES "gfx12")
add_custom_target(example_convnd_activ_wmma_convscale_add)
add_example_executable(example_convnd_fwd_wmma_convscale_add_fp8 convnd_fwd_wmma_convscale_add_fp8.cpp)
add_example_dependencies(example_convnd_activ_wmma_convscale_add example_convnd_fwd_wmma_convscale_add_fp8)
endif()

View File

@@ -0,0 +1,99 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include "ck/utility/tuple.hpp"
#include "convnd_fwd_convscale_add_common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_wmma_cshuffle_v3.hpp"
using InDataType = ck::f8_t;
using WeiDataType = ck::f8_t;
using AccDataType = float;
using CShuffleDataType = float;
using DsDataType = float;
using OutDataType = ck::f8_t;
using AComputeDataType = ck::f8_t;
using BComputeDataType = ck::f8_t;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = ConvScaleAdd;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename DsLayout,
typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Wmma_CShuffle_V3<
NDimSpatial, // NDimSpatial
InLayout, // ALayout
WeiLayout, // BLayout
ck::Tuple<DsLayout>, // DsLayout
OutLayout, // ELayout
InDataType, // ADataType
WeiDataType, // BDataType
AccDataType, // AccDataType
CShuffleDataType, // CShuffleDataType
ck::Tuple<DsDataType>, // DsDataType
OutDataType, // EDataType
InElementOp, // AElementwiseOperation
WeiElementOp, // BElementwiseOperation
OutElementOp, // CDEElementwiseOperation
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
64, // BlockSize
64, // MPerBlock
64, // NPerBlock
32, // KPerBlock
8, // AK1
8, // BK1
16, // MPerWmma
16, // NPerWmma
4, // MRepeat
2, // NRepeat
S<4, 16, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
1, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 16, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
1, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1, // CShuffleMRepeatPerShuffle
1, // CShuffleNRepeatPerShuffle
S<1, 16, 1, 4>, // CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
1, // CDEBlockTransferScalarPerVector_NPerBlock
ck::BlockGemmPipelineScheduler::Intrawave, // BlkGemmPipeSched
ck::BlockGemmPipelineVersion::v1, // BlkGemmPipelineVer
true, // UseThreadTileTransfer
AComputeDataType, // AComputeDataType
BComputeDataType, // BComputeDataType
1>; // NumGroupsToMerge
#include "run_convnd_fwd_convscale_add_example.inc"
int main(int argc, char* argv[])
{
if(!ck::is_gfx12_supported())
{
std::cout << "This kernel support gfx12 only" << std::endl;
return 0;
}
return run_convnd_fwd_example(argc, argv) ? 0 : 1;
}

View File

@@ -8,4 +8,11 @@ if (NOT GPU_TARGETS MATCHES "gfx11")
add_example_executable(example_convnd_fwd_xdl_convscale_amax_fp8 convnd_fwd_xdl_convscale_amax_fp8.cpp)
add_example_dependencies(example_convnd_activ_xdl_convscale_reduce example_convnd_fwd_xdl_convscale_amax_fp8)
endif()
endif()
# WMMA
if (GPU_TARGETS MATCHES "gfx12")
add_custom_target(example_convnd_activ_wmma_convscale_reduce)
add_example_executable(example_convnd_fwd_wmma_convscale_amax_fp8 convnd_fwd_wmma_convscale_amax_fp8.cpp)
add_example_dependencies(example_convnd_activ_wmma_convscale_reduce example_convnd_fwd_wmma_convscale_amax_fp8)
endif()

View File

@@ -0,0 +1,94 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include "convnd_fwd_convscale_reduce_common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_wmma_cshuffle_v3.hpp"
using InDataType = ck::f8_t;
using WeiDataType = ck::f8_t;
using AccDataType = float;
using CShuffleDataType = float;
using ConvOutDataType = float; // data type of convolution result
using OutDataType = ck::f8_t; // data type of final result
using AComputeDataType = ck::f8_t;
using BComputeDataType = ck::f8_t;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = ConvScale;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Wmma_CShuffle_V3<
NDimSpatial, // NDimSpatial
InLayout, // ALayout
WeiLayout, // BLayout
ck::Tuple<>, // DsLayout
OutLayout, // ELayout
InDataType, // ADataType
WeiDataType, // BDataType
AccDataType, // AccDataType
CShuffleDataType, // CShuffleDataType
ck::Tuple<>, // DsDataType
ConvOutDataType, // EDataType
InElementOp, // AElementwiseOperation
WeiElementOp, // BElementwiseOperation
OutElementOp, // CDEElementwiseOperation
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
64, // BlockSize
64, // MPerBlock
64, // NPerBlock
32, // KPerBlock
8, // AK1
8, // BK1
16, // MPerWmma
16, // NPerWmma
4, // MRepeat
2, // NRepeat
S<4, 16, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
1, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 16, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
1, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1, // CShuffleMRepeatPerShuffle
1, // CShuffleNRepeatPerShuffle
S<1, 16, 1, 4>, // CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
1, // CDEBlockTransferScalarPerVector_NPerBlock
ck::BlockGemmPipelineScheduler::Intrawave, // BlkGemmPipeSched
ck::BlockGemmPipelineVersion::v1, // BlkGemmPipelineVer
true, // UseThreadTileTransfer
AComputeDataType, // AComputeDataType
BComputeDataType, // BComputeDataType
1>; // NumGroupsToMerge
#include "run_convnd_fwd_example.inc"
int main(int argc, char* argv[])
{
if(!ck::is_gfx12_supported())
{
std::cout << "This kernel support gfx12 only" << std::endl;
return 0;
}
return run_convnd_fwd_example(argc, argv) ? 0 : 1;
}

View File

@@ -6,3 +6,10 @@ if (NOT GPU_TARGETS MATCHES "gfx11")
add_example_executable(example_convnd_fwd_xdl_convscale_relu_fp8 convnd_fwd_xdl_convscale_relu_fp8.cpp)
add_example_dependencies(example_convnd_activ_xdl_convscale_relu example_convnd_fwd_xdl_convscale_relu_fp8)
endif()
# WMMA
if (GPU_TARGETS MATCHES "gfx12")
add_custom_target(example_convnd_activ_wmma_convscale_relu)
add_example_executable(example_convnd_fwd_wmma_convscale_relu_fp8 convnd_fwd_wmma_convscale_relu_fp8.cpp)
add_example_dependencies(example_convnd_activ_wmma_convscale_relu example_convnd_fwd_wmma_convscale_relu_fp8)
endif()

View File

@@ -0,0 +1,98 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include "convnd_fwd_convscale_relu_common.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_wmma_cshuffle_v3.hpp"
using InDataType = ck::f8_t;
using WeiDataType = ck::f8_t;
using AccDataType = float;
using CShuffleDataType = float;
using DsDataType = ck::Tuple<>;
using OutDataType = ck::f8_t;
using AComputeDataType = ck::f8_t;
using BComputeDataType = ck::f8_t;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using InElementOp = PassThrough;
using WeiElementOp = PassThrough;
using OutElementOp = ConvScaleRelu;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
template <ck::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename DsLayout,
typename OutLayout>
using DeviceGroupedConvNDFwdInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Wmma_CShuffle_V3<
NDimSpatial, // NDimSpatial
InLayout, // ALayout
WeiLayout, // BLayout
DsLayout, // DsLayout (empty tuple for ConvScaleRelu)
OutLayout, // ELayout
InDataType, // ADataType
WeiDataType, // BDataType
AccDataType, // AccDataType
CShuffleDataType, // CShuffleDataType
DsDataType, // DsDataType (empty tuple)
OutDataType, // EDataType
InElementOp, // AElementwiseOperation
WeiElementOp, // BElementwiseOperation
OutElementOp, // CDEElementwiseOperation
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
64, // BlockSize
64, // MPerBlock
64, // NPerBlock
32, // KPerBlock
8, // AK1
8, // BK1
16, // MPerWmma
16, // NPerWmma
4, // MRepeat
2, // NRepeat
S<4, 16, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
1, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 16, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
1, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1, // CShuffleMRepeatPerShuffle
1, // CShuffleNRepeatPerShuffle
S<1, 16, 1, 4>, // CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
1, // CDEBlockTransferScalarPerVector_NPerBlock
ck::BlockGemmPipelineScheduler::Intrawave, // BlkGemmPipeSched
ck::BlockGemmPipelineVersion::v1, // BlkGemmPipelineVer
true, // UseThreadTileTransfer
AComputeDataType, // AComputeDataType
BComputeDataType, // BComputeDataType
1>; // NumGroupsToMerge
#include "run_convnd_fwd_convscale_relu_example.inc"
int main(int argc, char* argv[])
{
if(!ck::is_gfx12_supported())
{
std::cout << "This kernel support gfx12 only" << std::endl;
return 0;
}
return run_convnd_fwd_example(argc, argv) ? 0 : 1;
}

View File

@@ -37,4 +37,10 @@ add_example_executable(example_convnd_fwd_xdl_dynamic_passthrough_fp16 convnd_fw
add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_passthrough_fp16)
# Logistic
add_example_executable(example_convnd_fwd_xdl_dynamic_logistic_fp16 convnd_fwd_xdl_dynamic_logistic_fp16.cpp)
add_example_dependencies(example_convnd_activ_dynamic_unary_xdl example_convnd_fwd_xdl_dynamic_logistic_fp16)
# WMMA
add_custom_target(example_convnd_activ_dynamic_unary_wmma)
# PassThrough
add_example_executable(example_convnd_fwd_wmma_dynamic_passthrough_fp16 convnd_fwd_wmma_dynamic_passthrough_fp16.cpp)
add_example_dependencies(example_convnd_activ_dynamic_unary_wmma example_convnd_fwd_wmma_dynamic_passthrough_fp16)

View File

@@ -0,0 +1,245 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <cstdlib>
#include <iostream>
#include <numeric>
#include <type_traits>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_wmma_cshuffle_v3.hpp"
#include "ck/library/utility/algorithm.hpp"
#include "ck/library/utility/check_err.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/host_tensor.hpp"
#include "ck/library/utility/host_tensor_generator.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp"
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
constexpr ck::index_t NDimSpatial = 3;
using InDataType = ck::half_t;
using WeiDataType = ck::half_t;
using AccDataType = float;
using CShuffleDataType = ck::half_t;
using OutDataType = ck::half_t;
using AComputeDataType = ck::half_t;
using BComputeDataType = ck::half_t;
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
// Use correct tensor layouts for WMMA (matching working tests)
using InLayout = ck::tensor_layout::convolution::NDHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
using DynamicElementOp = ck::tensor_operation::element_wise::DynamicUnaryOp;
static constexpr auto ConvSpec =
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
using DeviceGroupedConvNDActivInstance =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Wmma_CShuffle_V3<
NDimSpatial, // NDimSpatial
InLayout, // ALayout
WeiLayout, // BLayout
ck::Tuple<>, // DsLayout
OutLayout, // ELayout
InDataType, // ADataType
WeiDataType, // BDataType
AccDataType, // AccDataType
CShuffleDataType, // CShuffleDataType
ck::Tuple<>, // DsDataType
OutDataType, // EDataType
InElementOp, // AElementwiseOperation
WeiElementOp, // BElementwiseOperation
DynamicElementOp, // CDEElementwiseOperation
ConvSpec, // ConvForwardSpecialization
GemmSpec, // GemmSpecialization
64, // BlockSize
64, // MPerBlock
64, // NPerBlock
32, // KPerBlock
8, // AK1
8, // BK1
16, // MPerWmma
16, // NPerWmma
4, // MRepeat
2, // NRepeat
S<4, 16, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
2, // ABlockTransferSrcVectorDim
1, // ABlockTransferSrcScalarPerVector
8, // ABlockTransferDstScalarPerVector_AK1
1, // ABlockLdsExtraM
S<4, 16, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
2, // BBlockTransferSrcVectorDim
1, // BBlockTransferSrcScalarPerVector
8, // BBlockTransferDstScalarPerVector_BK1
1, // BBlockLdsExtraN
1, // CShuffleMRepeatPerShuffle
1, // CShuffleNRepeatPerShuffle
S<1, 16, 1, 4>, // CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
1, // CDEBlockTransferScalarPerVector_NPerBlock
ck::BlockGemmPipelineScheduler::Intrawave, // BlkGemmPipeSched
ck::BlockGemmPipelineVersion::v1, // BlkGemmPipelineVer
true, // UseThreadTileTransfer
AComputeDataType, // AComputeDataType
BComputeDataType, // BComputeDataType
1>; // NumGroupsToMerge
template <ck::index_t NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename InElementOp,
typename WeiElementOp,
typename OutElementOp,
typename DeviceConvNDFwdInstance>
bool run_grouped_conv(bool do_verification,
int init_method,
bool time_kernel,
const ck::utils::conv::ConvParam& conv_param,
const ck::HostTensorDescriptor& in_g_n_c_wis_desc,
const ck::HostTensorDescriptor& wei_g_k_c_xs_desc,
const ck::HostTensorDescriptor& out_g_n_k_wos_desc,
const InElementOp& in_element_op,
const WeiElementOp& wei_element_op,
const OutElementOp& out_element_op)
{
ck::Tensor<InDataType> in(in_g_n_c_wis_desc);
ck::Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
ck::Tensor<OutDataType> out_host(out_g_n_k_wos_desc);
ck::Tensor<OutDataType> out_device(out_g_n_k_wos_desc);
std::cout << "in: " << in.mDesc << std::endl;
std::cout << "wei: " << wei.mDesc << std::endl;
std::cout << "out: " << out_host.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
in.GenerateTensorValue(GeneratorTensor_2<InDataType>{-2, 2});
wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-2, 2});
break;
default:
in.GenerateTensorValue(GeneratorTensor_3<InDataType>{-1.0, 1.0});
wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.05, 0.05});
}
ck::DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize());
ck::DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize());
ck::DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize());
in_device_buf.ToDevice(in.mData.data());
wei_device_buf.ToDevice(wei.mData.data());
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_lengths{};
std::array<ck::index_t, NDimSpatial + 3> a_g_n_c_wis_strides{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_lengths{};
std::array<ck::index_t, NDimSpatial + 3> b_g_k_c_xs_strides{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_lengths{};
std::array<ck::index_t, NDimSpatial + 3> e_g_n_k_wos_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
std::array<ck::index_t, NDimSpatial> input_left_pads{};
std::array<ck::index_t, NDimSpatial> input_right_pads{};
auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); };
copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths);
copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides);
copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths);
copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides);
copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths);
copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides);
copy(conv_param.conv_filter_strides_, conv_filter_strides);
copy(conv_param.conv_filter_dilations_, conv_filter_dilations);
copy(conv_param.input_left_pads_, input_left_pads);
copy(conv_param.input_right_pads_, input_right_pads);
// do Conv
auto conv = DeviceConvNDFwdInstance{};
auto invoker = conv.MakeInvoker();
auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(),
wei_device_buf.GetDeviceBuffer(),
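// No D tensors in this flavor: pass empty pointer and stride arrays.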
std::array<const void*, 0>{},
out_device_buf.GetDeviceBuffer(),
a_g_n_c_wis_lengths,
a_g_n_c_wis_strides,
b_g_k_c_xs_lengths,
b_g_k_c_xs_strides,
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
std::array<std::array<ck::index_t, NDimSpatial + 3>, 0>{{}},
e_g_n_k_wos_lengths,
e_g_n_k_wos_strides,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads,
in_element_op,
wei_element_op,
out_element_op);
if(!conv.IsSupportedArgument(argument))
{
throw std::runtime_error("The device op with the specified compilation parameters does "
"not support this convolution problem.");
}
float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
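// avg_time is in ms: flop / 1e9 / ms yields TFLOPS; bytes / 1e6 / ms yields GB/s.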
std::size_t flop = conv_param.GetFlops();
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_btype / 1.E6 / avg_time;
std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< conv.GetTypeString() << std::endl;
if(do_verification)
{
auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd<NDimSpatial,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>();
auto ref_invoker = ref_conv.MakeInvoker();
auto ref_argument = ref_conv.MakeArgument(in,
wei,
out_host,
conv_param.conv_filter_strides_,
conv_param.conv_filter_dilations_,
conv_param.input_left_pads_,
conv_param.input_right_pads_,
in_element_op,
wei_element_op,
out_element_op);
ref_invoker.Run(ref_argument);
out_device_buf.FromDevice(out_device.mData.data());
return ck::utils::check_err(out_device, out_host, "Error: incorrect results!", 1e-3, 0.1);
}
return true;
}
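A hedged sketch of how this helper is typically driven (the real driver lives in the shared run_convnd_activ_dynamic_example.inc; the ConvParam constructor and descriptor-helper names below are assumptions based on CK's utility headers, and DynamicUnaryOp is assumed constructible from a concrete unary op):

    // Hypothetical driver, illustrative only.
    inline bool run_default_dynamic_case()
    {
        // 3D conv: G=2, N=4, K=192, C=192, 3x3x3 filter, 28^3 input, unit
        // strides/dilations/pads (values chosen arbitrarily for the sketch).
        ck::utils::conv::ConvParam conv_param{
            3, 2, 4, 192, 192,
            {3, 3, 3}, {28, 28, 28}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}};
        const auto in_desc =
            ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
        const auto wei_desc =
            ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
        const auto out_desc =
            ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
        return run_grouped_conv<NDimSpatial,
                                InDataType,
                                WeiDataType,
                                OutDataType,
                                InElementOp,
                                WeiElementOp,
                                DynamicElementOp,
                                DeviceGroupedConvNDActivInstance>(
            /*do_verification=*/true,
            /*init_method=*/1,
            /*time_kernel=*/false,
            conv_param,
            in_desc,
            wei_desc,
            out_desc,
            InElementOp{},
            WeiElementOp{},
            DynamicElementOp{ck::tensor_operation::element_wise::PassThrough{}});
    }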

View File

@@ -0,0 +1,12 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#include "convnd_fwd_activ_dynamic_unary_wmma_common.hpp"
#include "../run_convnd_activ_dynamic_example.inc"
int main(int argc, char* argv[])
{
ck::tensor_operation::element_wise::PassThrough out_element_op;
return !run_convnd_example(argc, argv, out_element_op);
}
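Other unary flavors would presumably mirror the xdl side (cf. the dynamic_logistic fp16 example in the CMake hunk above); a hypothetical counterpart, assuming a default-constructible Logistic elementwise op exists as the xdl example name suggests:

    // Hypothetical convnd_fwd_wmma_dynamic_logistic_fp16.cpp, not part of
    // this change; shown only to illustrate the pattern.
    #include "convnd_fwd_activ_dynamic_unary_wmma_common.hpp"
    #include "../run_convnd_activ_dynamic_example.inc"
    int main(int argc, char* argv[])
    {
        ck::tensor_operation::element_wise::Logistic out_element_op;
        return !run_convnd_example(argc, argv, out_element_op);
    }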

View File

@@ -47,6 +47,12 @@ bool run_convnd_example(int argc, char* argv[], const OutElementOp& out_element_
conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv);
}
if(std::is_same_v<OutElementOp, ck::tensor_operation::element_wise::SoftRelu> &&
init_method != 2)
{
std::cout << "Running SoftRelu op with int initialization. Risk of overflow.\n\n";
}
const auto in_element_op = InElementOp{};
const auto wei_element_op = WeiElementOp{};