Merge branch 'develop' into zan_fix_bufferloadlds

2026-06-29 11:16:59 +00:00 · 2025-06-27 14:33:46 +08:00
parent 87bd161eec e1c5172fdb
commit ed0375bb4d
122 changed files with 6086 additions and 2232 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -68,3 +68,6 @@ build*/

 # Python cache
 __pycache__/
+
+.cache/
+
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -18,9 +18,11 @@ Documentation for Composable Kernel available at [https://rocm.docs.amd.com/proj
 * Added support for FP16 2:4 structured sparsity to universal GEMM.
 * Added support for Split K for grouped convolution backward data.
 * Added logit soft-capping support for fMHA forward kernels.
+* Added support for hdim as a multiple of 32 for FMHA (fwd/fwd_splitkv)
 * Added benchmarking support for tile engine GEMM.
 * Added Ping-pong scheduler support for GEMM operation along the K dimension.
 * Added rotating buffer feature for CK_Tile GEMM.
+* Added int8 support for CK_TILE GEMM.

 ### Optimized

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -634,7 +634,7 @@ option(BUILD_MHA_LIB "Build the static library for flash attention" OFF)

 add_subdirectory(library)

-if(NOT GPU_ARCHS AND USER_GPU_TARGETS)
+if(NOT GPU_ARCHS AND USER_GPU_TARGETS AND NOT MIOPEN_REQ_LIBS_ONLY)
   rocm_package_setup_component(tests
        LIBRARY_NAME composablekernel
        PACKAGE_NAME tests # Prevent -static suffix on package name
--- a/46
+++ b/46
@@ -225,6 +225,10 @@ def cmake_build(Map conf=[:]){
    def build_envs = "CTEST_PARALLEL_LEVEL=4 " + conf.get("build_env","")
    def prefixpath = conf.get("prefixpath","/opt/rocm")
    def setup_args = conf.get("setup_args","")
+    // make sure all unit tests always run on develop branch
+    if(env.BRANCH_NAME == "develop"){
+        params.RUN_ALL_UNIT_TESTS = true
+    }

    if (prefixpath != "/usr/local"){
        setup_args = setup_args + " -DCMAKE_PREFIX_PATH=${prefixpath} "
@@ -796,7 +800,7 @@ def process_results(Map conf=[:]){
 }

 //launch develop branch daily jobs
-CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_TRANSPOSE_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true;RUN_TILE_ENGINE_GEMM_TESTS=false;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
+CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true;DISABLE_DL_KERNELS=true;RUN_CK_TILE_FMHA_TESTS=true;RUN_CK_TILE_TRANSPOSE_TESTS=true;RUN_CK_TILE_GEMM_TESTS=true;RUN_TILE_ENGINE_GEMM_TESTS=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
                                              0 21 * * * % RUN_GROUPED_CONV_LARGE_CASES_TESTS=true;hipTensor_test=true;BUILD_GFX908=true;BUILD_GFX950=true;RUN_PERFORMANCE_TESTS=true;RUN_ALL_UNIT_TESTS=true
                                              0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-staging;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true
                                              0 17 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-mainline;BUILD_COMPILER=/llvm-project/build/bin/clang++;USE_SCCACHE=false;NINJA_BUILD_TRACE=true;RUN_ALL_UNIT_TESTS=true
@@ -896,10 +900,26 @@ pipeline {
            name: "BUILD_GFX908",
            defaultValue: false,
            description: "Build CK and run tests on gfx908 (default: OFF)")
+        booleanParam(
+            name: "BUILD_GFX90A",
+            defaultValue: true,
+            description: "Build CK and run tests on gfx90a (default: ON)")
+        booleanParam(
+            name: "BUILD_GFX942",
+            defaultValue: true,
+            description: "Build CK and run tests on gfx942 (default: ON)")
        booleanParam(
            name: "BUILD_GFX950",
            defaultValue: false,
            description: "Build CK and run tests on gfx950 (default: OFF)")
+        booleanParam(
+            name: "BUILD_GFX10",
+            defaultValue: true,
+            description: "Build CK and run tests on gfx10 (default: ON)")
+        booleanParam(
+            name: "BUILD_GFX11",
+            defaultValue: true,
+            description: "Build CK and run tests on gfx11 (default: ON)")
        booleanParam(
            name: "BUILD_GFX12",
            defaultValue: true,
@@ -1196,9 +1216,12 @@ pipeline {
                                            -D CMAKE_CXX_COMPILER="${build_compiler()}" \
                                            -D CMAKE_BUILD_TYPE=Release \
                                            -D GPU_TARGETS="gfx90a" \
+                                            -D GEMM_DATATYPE="fp8;fp16" \
                                            -DCMAKE_CXX_FLAGS=" -O3 " .. && \
-                                           ninja -j64 benchmark_gemm && \
-                                           ./bin/benchmark_gemm """
+                                           ninja -j64 benchmark_gemm_fp8 && \
+                                           ./bin/benchmark_gemm_fp8 && \
+                                           ninja -j64 benchmark_gemm_fp16 && \
+                                           ./bin/benchmark_gemm_fp16 """
                    }
                    steps{
                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
@@ -1218,9 +1241,12 @@ pipeline {
                                            -D CMAKE_CXX_COMPILER="${build_compiler()}" \
                                            -D CMAKE_BUILD_TYPE=Release \
                                            -D GPU_TARGETS="gfx942" \
+                                            -D GEMM_DATATYPE="fp8;fp16" \
                                            -DCMAKE_CXX_FLAGS=" -O3 " .. && \
-                                           ninja -j128 benchmark_gemm && \
-                                           ./bin/benchmark_gemm """
+                                           ninja -j128 benchmark_gemm_fp8 && \
+                                           ./bin/benchmark_gemm_fp8 && \
+                                           ninja -j128 benchmark_gemm_fp16 && \
+                                           ./bin/benchmark_gemm_fp16 """
                    }
                    steps{
                        buildHipClangJobAndReboot(setup_args:setup_args, no_reboot:true, build_type: 'Release', execute_cmd: execute_args)
@@ -1276,7 +1302,7 @@ pipeline {
                {
                    when {
                        beforeAgent true
-                        expression { params.RUN_FULL_QA.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
+                        expression { (params.BUILD_GFX942.toBoolean() || params.RUN_FULL_QA.toBoolean()) && !params.BUILD_INSTANCES_ONLY.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
                    }
                    agent{ label rocmnode("gfx942") }
                    environment{
@@ -1314,7 +1340,7 @@ pipeline {
                                           -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
                    }
                    steps{
-                        Build_CK_and_Reboot(setup_args: setup_args, docker_name: "rocm/composable_kernel-private:ck_ub22.04_rocm7.0", config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
+                        Build_CK_and_Reboot(setup_args: setup_args, docker_name: "${env.CK_DOCKERHUB_PRIVATE}:ck_ub22.04_rocm7.0", config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
                        cleanWs()
                    }
                }
@@ -1343,7 +1369,7 @@ pipeline {
                {
                    when {
                        beforeAgent true
-                        expression { !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
+                        expression { params.BUILD_GFX90A.toBoolean() && !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
                    }
                    agent{ label rocmnode("gfx90a") }
                    environment{
@@ -1382,7 +1408,7 @@ pipeline {
                {
                    when {
                        beforeAgent true
-                        expression { !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
+                        expression { params.BUILD_GFX10.toBoolean() && !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
                    }
                    agent{ label rocmnode("gfx1030") }
                    environment{
@@ -1403,7 +1429,7 @@ pipeline {
                {
                    when {
                        beforeAgent true
-                        expression { !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
+                        expression { params.BUILD_GFX11.toBoolean() && !params.RUN_FULL_QA.toBoolean() && !params.BUILD_INSTANCES_ONLY.toBoolean() && !params.BUILD_LEGACY_OS.toBoolean() }
                    }
                    agent{ label rocmnode("gfx1101") }
                    environment{
--- a/client_example/32_gemm_mx/CMakeLists.txt
+++ b/client_example/32_gemm_mx/CMakeLists.txt
@@ -0,0 +1,4 @@
+if(GPU_TARGETS MATCHES "gfx950")
+	add_executable(client_gemm_mx_fp8 gemm_mx_fp8.cpp)
+	target_link_libraries(client_gemm_mx_fp8 PRIVATE composable_kernel::device_gemm_operations)
+endif()
--- a/client_example/32_gemm_mx/gemm_mx_fp8.cpp
+++ b/client_example/32_gemm_mx/gemm_mx_fp8.cpp
@@ -0,0 +1,330 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+#include <iomanip>
+#include <vector>
+#include <iostream>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/utility/data_type.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_mx.hpp"
+#include "ck/library/tensor_operation_instance/gpu/gemm_mx.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3_mx.hpp"
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using AElementOp = PassThrough;
+using BElementOp = PassThrough;
+using CElementOp = PassThrough;
+
+using ADataType = ck::f8_t;
+using BDataType = ck::f8_t;
+using CDataType = ck::half_t;
+
+using XDataType       = ck::e8m0_bexp_t;
+using XPackedDataType = int32_t;
+template <typename X, typename Y>
+inline constexpr bool is_same_v = ck::is_same<X, Y>::value;
+
+using ALayout = Row;
+using BLayout = Col;
+using CLayout = Row;
+
+using AScaleLayout = Row;
+using BScaleLayout = Col;
+
+template <bool KLast>
+void preShuffleScaleBuffer(ck::e8m0_bexp_t* src, ck::e8m0_bexp_t* dst, int MN, int K)
+{
+    int MNXdlPack = 2;
+    int KXdlPack  = 2;
+
+    int XdlMNThread = 16;
+    int XdlKThread  = 64 / XdlMNThread;
+
+    int K0 = K / KXdlPack / XdlKThread; // KRepeat
+
+    // The 4 16x128 building blocks will be packed into 1 32x256 for F4
+    // The 8 16x16x128 mfma will be packed into 1 32x32x256 for F4
+
+    // unfold the MN32xK(256/32) scale buffer
+    //    4            16             2           2
+    // To XdlKThread-> XdlMNThread -> KXdlPack -> MNXdlPack
+    // Then, MNRepeat->KRepeat
+
+    for(int n = 0; n < MN; ++n)
+    {
+        for(int k = 0; k < K; ++k)
+        {
+            int n0    = n / (XdlMNThread * MNXdlPack); // i MNRepeat
+            int tempn = n % (XdlMNThread * MNXdlPack);
+            int n1    = tempn % XdlMNThread; // i XdlMNThread
+            int n2    = tempn / XdlMNThread; // i MNXdlPack
+
+            int k0    = k / (XdlKThread * KXdlPack); // i KRepeat
+            int tempk = k % (XdlKThread * KXdlPack);
+            int k1    = tempk % XdlKThread; // i XdlKThread
+            int k2    = tempk / XdlKThread; // i KXdlPack
+
+            int outputIndex = n0 * MNXdlPack * KXdlPack * XdlMNThread * XdlKThread * K0 +
+                              k0 * MNXdlPack * KXdlPack * XdlMNThread * XdlKThread +
+                              k1 * MNXdlPack * KXdlPack * XdlMNThread + n1 * MNXdlPack * KXdlPack +
+                              k2 * MNXdlPack + n2;
+            // src[n * K + k] = ck::type_convert<ck::e8m0_bexp_t>(static_cast<float>(powf(2.0f, n2 +
+            // k2 * MNXdlPack)));
+            if constexpr(KLast)
+                dst[outputIndex] = src[n * K + k];
+            else
+                dst[outputIndex] = src[k * MN + n];
+        }
+    }
+}
+
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        mem_size_ = mem_size;
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+
+    void* GetDeviceBuffer() { return p_mem_; }
+
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+
+    void* p_mem_;
+    std::size_t mem_size_;
+};
+
+int main(int argc, char* argv[])
+{
+    // GEMM shape
+    ck::index_t M = 3840;
+    ck::index_t N = 4096;
+    ck::index_t K = 4096;
+
+    ck::index_t StrideA = 4096;
+    ck::index_t StrideB = 4096;
+    ck::index_t StrideC = 4096;
+
+    ck::index_t KBatch = 1;
+
+    /* Require by mx type*/
+    constexpr ck::index_t ScaleBlockSize = 32; // scaling block size
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 7)
+    {
+        M = std::stoi(argv[1]);
+        N = std::stoi(argv[2]);
+        K = std::stoi(argv[3]);
+
+        StrideA = std::stoi(argv[4]);
+        StrideB = std::stoi(argv[5]);
+        StrideC = std::stoi(argv[6]);
+    }
+    else
+    {
+        printf("arg1 to 6: M, N, K, StrideA, StrideB, StrideC\n");
+        exit(0);
+    }
+
+    auto f_matrix_space_size =
+        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+            using Layout = decltype(layout);
+
+            if constexpr(std::is_same<Layout, Row>::value)
+            {
+                return (nRow - 1) * stride + nCol;
+            }
+            else
+            {
+                return (nCol - 1) * stride + nRow;
+            }
+        };
+
+    /* Scale stride Calculation */
+    auto f_get_default_stride =
+        [](ck::index_t row, ck::index_t col, ck::index_t stride, auto layout) {
+            if(stride == -1)
+            {
+                // give a chance if stride is -1, return a default packed stride
+                if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
+                    return static_cast<ck::index_t>(col);
+                else
+                    return static_cast<ck::index_t>(row);
+            }
+            else
+                return static_cast<ck::index_t>(stride);
+        };
+
+    if(K % ScaleBlockSize != 0)
+    {
+        throw std::runtime_error("wrong! K must be multiple of ScaleBlockSize.");
+    };
+    auto Scale_Padded_M = (M + ScaleBlockSize - 1) / ScaleBlockSize * ScaleBlockSize;
+    auto Scale_Stride_AM =
+        f_get_default_stride(Scale_Padded_M, K / ScaleBlockSize, -1, AScaleLayout{});
+    auto Scale_Stride_BN = f_get_default_stride(K / ScaleBlockSize, N, -1, BScaleLayout{});
+
+    SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+    SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+    SimpleDeviceMem c_device_buf(sizeof(CDataType) * f_matrix_space_size(M, N, StrideC, CLayout{}));
+    SimpleDeviceMem a_scale_device_buf(
+        sizeof(XDataType) *
+        f_matrix_space_size(Scale_Padded_M, K / ScaleBlockSize, Scale_Stride_AM, AScaleLayout{}));
+    SimpleDeviceMem b_scale_device_buf(
+        sizeof(XDataType) *
+        f_matrix_space_size(K / ScaleBlockSize, N, Scale_Stride_BN, BScaleLayout{}));
+
+    using DeviceOp =
+        ck::tensor_operation::device::DeviceGemmMX<ALayout,
+                                                   BLayout,
+                                                   CLayout,
+                                                   ADataType,
+                                                   XPackedDataType,
+                                                   BDataType,
+                                                   XPackedDataType,
+                                                   CDataType,
+                                                   ScaleBlockSize,
+                                                   ck::tensor_operation::element_wise::PassThrough,
+                                                   ck::tensor_operation::element_wise::PassThrough,
+                                                   ck::tensor_operation::element_wise::PassThrough>;
+
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+
+    const auto a_element_op = AElementOp{};
+    const auto b_element_op = BElementOp{};
+    const auto c_element_op = CElementOp{};
+
+    std::string best_op_name;
+    bool found            = false;
+    int best_op_id        = -1;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+
+    for(int i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr = op_ptrs[i];
+
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+            static_cast<XPackedDataType*>(a_scale_device_buf.GetDeviceBuffer()),
+            static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+            static_cast<XPackedDataType*>(b_scale_device_buf.GetDeviceBuffer()),
+            static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+            M,
+            N,
+            K,
+            StrideA,
+            Scale_Stride_AM,
+            StrideB,
+            Scale_Stride_BN,
+            StrideC,
+            KBatch,
+            a_element_op,
+            b_element_op,
+            c_element_op);
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        std::string op_name = op_ptr->GetTypeString();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+
+            std::size_t flop =
+                std::size_t(2) * M * N * K + std::size_t(2) * M * N * K / ScaleBlockSize;
+
+            std::size_t num_btype = sizeof(ADataType) * M * K / ck::packed_size_v<ADataType> +
+                                    sizeof(BDataType) * K * N / ck::packed_size_v<BDataType> +
+                                    sizeof(CDataType) * M * N +
+                                    sizeof(XDataType) * M * K / ScaleBlockSize +
+                                    sizeof(XDataType) * N * K / ScaleBlockSize;
+
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+            if(tflops > best_tflops)
+            {
+                found           = true;
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_tflops     = tflops;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+
+    // run the best intance
+    if(found)
+    {
+        auto& op_ptr = op_ptrs[best_op_id];
+
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+            static_cast<XPackedDataType*>(a_scale_device_buf.GetDeviceBuffer()),
+            static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+            static_cast<XPackedDataType*>(b_scale_device_buf.GetDeviceBuffer()),
+            static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+            M,
+            N,
+            K,
+            StrideA,
+            Scale_Stride_AM,
+            StrideB,
+            Scale_Stride_BN,
+            StrideC,
+            KBatch,
+            a_element_op,
+            b_element_op,
+            c_element_op);
+
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+
+        std::cout << "Done" << std::endl;
+    }
+
+    return 0;
+}
--- a/client_example/README.md
+++ b/client_example/README.md
@@ -14,8 +14,10 @@ cd client_example/build
 cmake                                                                 \
 -D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                             \
 -D CMAKE_PREFIX_PATH="/opt/rocm;${PATH_TO_CK_INSTALL_DIRECTORY}"      \
+-D GPU_TARGETS="gfx908;gfx90a"                                        \
 ..
 ```
+You must set the `GPU_TARGETS` macro to specify the GPU target architecture(s).

 ### Build client example
 ```bash
--- a/cmake/ShardInstantiation.cmake
+++ b/cmake/ShardInstantiation.cmake
@@ -0,0 +1,116 @@
+# Function to generate templated instantiation functions and caller function.
+
+# In order to reduce build times, we split the instantiation of template functions into multiple files.
+# Developers can use ck::util::generate_sharded_instantiations to generate the instantiation functions,
+# which can be placed the TEMPLATE_FILE (typically a .in file).
+
+# This CMake function generates the instantiation functions and a caller function that calls all the instantiation 
+# functions. The ck::util::generate_sharded_instantiations function allows us to generate an arbitrary number of
+# shards (NUM_SHARDS). This function loops over the shards, generates an instantiation function for each shard,
+# and generates a caller function that calls all the instantiation functions.
+
+# The explicit instatiation pattern requires the use of `extern template` to avoid implicit instantiation
+# of the template functions in the caller function, and that code is automatically generated by this function.
+
+# In addition to the user-supplied template, this CMake function uses two generic templates:
+#
+# 1. `instantiate_shard.in`: This is the template for the instantiation functions.
+# 2. `call_shard.in`: This is the template for the caller function that calls all the instantiation functions.
+
+# This function takes the following arguments:
+#
+# - INSTANCES_NAME: The name of the instances (the calling function will be named `add_${INSTANCE_NAMES}`).
+# - TEMPLATE_FILE: The path to the template file that contains the templated instantiation function definitions.
+# - NUM_SHARDS: The number of shards to generate.
+# - OUTPUT_DIR: The build directory where the generated source files will be placed.
+# - SRC_LIST: The list of source files to which the generated source files will be added.
+
+
+function(generate_sharded_instantiations)
+    cmake_parse_arguments(
+        GEN_SHARDED
+        # No boolean arguments
+        ""
+        # Single-value arguments
+        "INSTANCES_NAME;TEMPLATE_FILE;NUM_SHARDS;OUTPUT_DIR;SRC_LIST"
+        # No multi-value arguments.
+        ""
+        ${ARGN}
+    )
+    if (NOT GEN_SHARDED_INSTANCES_NAME)
+        message(FATAL_ERROR "INSTANCES_NAME is required for generate_sharded_instantiations")
+    endif()
+    if (NOT GEN_SHARDED_TEMPLATE_FILE)
+        message(FATAL_ERROR "TEMPLATE_FILE is required for generate_sharded_instantiations")
+    endif()
+    if (NOT GEN_SHARDED_NUM_SHARDS)
+        message(FATAL_ERROR "NUM_SHARDS is required for generate_sharded_instantiations")
+    endif()
+    if(NOT GEN_SHARDED_OUTPUT_DIR)
+        message(FATAL_ERROR "OUTPUT_DIR is required for generate_sharded_instantiations")
+    endif()
+    if (NOT GEN_SHARDED_SRC_LIST)
+        message(FATAL_ERROR "SRC_LIST is required for generate_sharded_instantiations")
+    endif()
+
+    file(MAKE_DIRECTORY ${GEN_SHARDED_OUTPUT_DIR})
+
+
+    set(GENERATED_SOURCE_FILES "")
+    set(EXTERN_TEMPLATE_STATEMENTS "")
+    set(CALL_STATEMENTS "")
+    message(STATUS "Generating sharded instantiations for target: ${GEN_SHARDED_INSTANCES_NAME}")
+
+    set(INSTANCES "${GEN_SHARDED_INSTANCES_NAME}")
+    
+    # Generate the inc file with the template function defintions.
+    # This include file will hold the template function definitions and a using alias for all the shard
+    # instantiation functions.
+    configure_file(
+        "${GEN_SHARDED_TEMPLATE_FILE}"
+        "${GEN_SHARDED_OUTPUT_DIR}/${INSTANCES}.inc"
+        @ONLY
+    )
+
+    # Generate the sharded instantiation functions.
+    # This is where the build parallelization happens.
+    # Each of these source files will contain a single instantiation function for a shard,
+    # which will be called sequentially by the caller function.
+    set(INC_DIR "${GEN_SHARDED_INC_DIR}")
+    math(EXPR LAST_SHARD_ID "${GEN_SHARDED_NUM_SHARDS} - 1")
+    foreach(SHARD_ID RANGE 0 ${LAST_SHARD_ID})
+        set(NUM_SHARDS "${GEN_SHARDED_NUM_SHARDS}")
+        set(SHARD_FUNCTION_PATH "${GEN_SHARDED_OUTPUT_DIR}/${INSTANCES}_shard_${SHARD_ID}.cpp")
+        set(SHARD_FUNCTION_TEMPLATE "${PROJECT_SOURCE_DIR}/cmake/instantiate_shard.in")
+        configure_file(
+            "${SHARD_FUNCTION_TEMPLATE}"
+            "${SHARD_FUNCTION_PATH}"
+            @ONLY
+        )
+        list(APPEND GENERATED_SOURCE_FILES "${SHARD_FUNCTION_PATH}")
+        set(SHARDED_FUNCTION_NAME "add_${INSTANCES}_shard<${NUM_SHARDS}, ${SHARD_ID}>")
+        list(APPEND EXTERN_TEMPLATE_STATEMENTS "extern template void\n${SHARDED_FUNCTION_NAME}(\n  ${INSTANCES}& instances)")
+        list(APPEND CALL_STATEMENTS "  ${SHARDED_FUNCTION_NAME}(instances)")
+    endforeach()
+
+    # Join the include statements, the extern template declarations, and the call statements each
+    # into a single string for variable substitution in the caller function.
+    string(REPLACE ";" ";\n" INCLUDE_STATEMENTS "${INCLUDE_STATEMENTS}")
+    string(REPLACE ";" ";\n" CALL_STATEMENTS "${CALL_STATEMENTS}")
+    string(REPLACE ";" ";\n" EXTERN_TEMPLATE_STATEMENTS "${EXTERN_TEMPLATE_STATEMENTS}")
+
+    # Generate the caller function.
+    set(CALLER_FUNCTION_PATH "${GEN_SHARDED_OUTPUT_DIR}/${INSTANCES}.cpp")
+    set(FUNCTION_TEMPLATE "${PROJECT_SOURCE_DIR}/cmake/call_shard.in")
+    configure_file(
+        "${FUNCTION_TEMPLATE}"
+        "${CALLER_FUNCTION_PATH}"
+        @ONLY
+    )
+    list(APPEND GENERATED_SOURCE_FILES "${CALLER_FUNCTION_PATH}")
+
+    # Add the generated source files to the list of source files.
+    # This allows the generated source files to be included in the build.
+    list(APPEND ${GEN_SHARDED_SRC_LIST} ${GENERATED_SOURCE_FILES})
+    set(${GEN_SHARDED_SRC_LIST} "${${GEN_SHARDED_SRC_LIST}}" PARENT_SCOPE)
+endfunction()
--- a/cmake/call_shard.in
+++ b/cmake/call_shard.in
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "@INSTANCES@.inc"
+
+namespace ck::tensor_operation::device::instance {
+
+@EXTERN_TEMPLATE_STATEMENTS@;
+
+void add_@INSTANCES@(
+    @INSTANCES@& instances) {
+@CALL_STATEMENTS@; 
+}
+
+} // namespace ck::tensor_operation::device::instance
--- a/cmake/instantiate_shard.in
+++ b/cmake/instantiate_shard.in
@@ -0,0 +1,9 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "@INSTANCES@.inc"
+
+namespace ck::tensor_operation::device::instance {
+template void add_@INSTANCES@_shard<@NUM_SHARDS@, @SHARD_ID@>(
+    @INSTANCES@& instances);
+} // namespace ck::tensor_operation::device::instance
--- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp
@@ -139,6 +139,7 @@ static constexpr ck::index_t EVec          = 2;
 static constexpr ck::index_t D0Vec         = 1;
 static constexpr ck::index_t D1Vec         = 1;
 static constexpr ck::index_t D2Vec         = 1;
+static constexpr bool PerTokenQuant        = true;
 static constexpr bool MulRoutedWeight      = true;
 using DeviceOpInstance                     = ck::tensor_operation::device::DeviceMoeGemm
    // clang-format off
@@ -169,7 +170,7 @@ using DeviceOpInstance                     = ck::tensor_operation::device::Devic
               //    MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
                //  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
               2,        2,         S<1, CShuffleMLane, 1, CShuffleNLane>, S<EVec, D0Vec, D1Vec, D2Vec>,
-               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, 0, false, false, MulRoutedWeight, false, int32_t, A0DataType>;
+               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, 0, false, false, MulRoutedWeight, PerTokenQuant, int32_t, A0DataType>;
        // kernel 2: 128->32x128x128
        //  <      Row,      Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   128,   32,   128,    128,  16,  16,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,     S<8, 16, 1>,    S<1, 0, 2>,     S<1, 0, 2>,             2,              16,             16,          0,          1,           1,               S<1, 16, 1, 8>,      S<8, 8, 1>,  ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1, EDataType>;

@@ -197,7 +198,7 @@ int main(int argc, char* argv[])
    {
        // use default case
    }
-    else if(argc == 3)
+    else if(argc == 4)
    {
        // use default case
        do_verification = std::stoi(argv[1]);
@@ -238,7 +239,8 @@ int main(int argc, char* argv[])
    ck::index_t StrideB              = K;
    ck::index_t StrideE              = N;
    constexpr ck::index_t NumDTensor = DsDataType::Size();
-    constexpr auto StrideDs          = std::array<ck::index_t, NumDTensor>{0, 0, 0};
+    constexpr auto StrideDs          = PerTokenQuant ? std::array<ck::index_t, NumDTensor>{1, 1, 0}
+                                                     : std::array<ck::index_t, NumDTensor>{0, 0, 0};

    ck::index_t KBatch = 1;

@@ -279,8 +281,10 @@ int main(int argc, char* argv[])
    Tensor<A0DataType> a0_t_k_k(HostTensorDescriptor({tokens, topk, K}, {topk * K, K, 1}));
    Tensor<B0DataType> b0_e_n_k(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
    Tensor<B0DataType> b0_preshuffled(HostTensorDescriptor({experts, K, N}, {N * K, 1, K}));
-    Tensor<D0DataType> d0_t_n(HostTensorDescriptor({tokens, N}, {StrideDs[0], 0}));
-    Tensor<D1DataType> d1_e_n(HostTensorDescriptor({experts, N}, {1, StrideDs[1]}));
+    Tensor<D0DataType> d0_t_n(
+        HostTensorDescriptor({tokens, topk, N}, {StrideDs[0] * topk, StrideDs[0], 0}));
+    Tensor<D1DataType> d1_e_n(
+        HostTensorDescriptor({experts, N}, {PerTokenQuant ? StrideDs[1] * N : 1, StrideDs[1]}));
    Tensor<D2DataType> d2_e_n(HostTensorDescriptor({sorted_size, N}, {1, 0}));
    Tensor<EDataType> e_t_n_host_result(HostTensorDescriptor({tokens, N}, {N, 1}));
    Tensor<EDataType> e_t_n_device_result(HostTensorDescriptor({tokens, N}, {N, 1}));
--- a/example/ck_tile/01_fmha/README.md
+++ b/example/ck_tile/01_fmha/README.md
@@ -71,6 +71,7 @@ args:
  -drop_seed    seed for random number generator (default:1)
 -drop_offset    offset for random number generator (default:0)
 -drop_prefs    seed and offset values are present on GPU; 0 - host, 1 - device/GPU (default:0)
+ -num_splits    number of splits for key/value. 0 to determine actual number by heuristic (default:1)
     -warmup    number of iterations before benchmark the kernel (default:5)
     -repeat    number of iterations to benchmark the kernel (default:20)
 ```
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
@@ -282,18 +282,19 @@ class FmhaFwdApiPool:
        # TODO: do we need to check duplication?
        if trait.dtype not in self.pool.keys():
            self.pool[trait.dtype] = dict()
-        if trait.hdim not in self.pool[trait.dtype].keys():
-            self.pool[trait.dtype][trait.hdim] = list()
+        hdim = trait.hdim, trait.bn1
+        if hdim not in self.pool[trait.dtype].keys():
+            self.pool[trait.dtype][hdim] = list()

-        self.pool[trait.dtype][trait.hdim].append(copy.copy(trait))
+        self.pool[trait.dtype][hdim].append(copy.copy(trait))

    @property
    def api(self) -> str:
        per_dtypes=str()
        for i, dtype in enumerate(self.pool.keys()):
            per_hdim_case=str()
-            for j, hdim in enumerate(self.pool[dtype].keys()):
-                traits=self.pool[dtype][hdim]
+            for j, (hdim, hdim_v) in enumerate(self.pool[dtype].keys()):
+                traits=self.pool[dtype][(hdim, hdim_v)]
                inners=str()
                for k, trait in enumerate(traits):
                    if_k = 'if' if k == 0 else 'else if'
@@ -306,7 +307,7 @@ class FmhaFwdApiPool:
                                   F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0max=trait.bk0max,
                                   F_hdim=hdim, F_dtype=FWD_DTYPE_MAP[dtype])
                if_j = 'if' if j == 0 else 'else if'
-                per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_hdim_v=trait.bn1, F_inner_dispatch=inners)
+                per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_hdim_v=hdim_v, F_inner_dispatch=inners)
            if_i = 'if' if i == 0 else 'else if'
            per_dtypes = per_dtypes + FMHA_FWD_API_PER_DTYPE.format(F_if=if_i, F_dtype=dtype, F_hdim_case=per_hdim_case)
        if not per_dtypes:
@@ -435,18 +436,20 @@ class FmhaFwdKernel:
 def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
    if dtype == 'fp16' or dtype == 'bf16':
        return {
-            '32'  : FmhaFwdTileSize(128, 64,  16, 32,  32,  32,   2, 1, 1,  2, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
-            '64'  : FmhaFwdTileSize(128, 64,  32, 64,  32,  64,   4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
-        ### '96'  : FmhaFwdTileSize(128, 128, 32, 128, 32,  96,   4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
-            '128' : FmhaFwdTileSize(128, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
-            '192' : FmhaFwdTileSize(128, 128, 32, 128, 32,  192,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
-            '256' : FmhaFwdTileSize(128, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
+            (32, 32)  : FmhaFwdTileSize(128, 64,  16, 32,  32,  32,   2, 1, 1,  2, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
+            (64, 64)  : FmhaFwdTileSize(128, 64,  32, 64,  32,  64,   4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
+        ### (96, 128) : FmhaFwdTileSize(128, 128, 32, 128, 32,  96,   4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
+            (128,128) : FmhaFwdTileSize(128, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
+        ### (160,160) : FmhaFwdTileSize(128, 128, 32, 160, 32,  160,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  1),
+            (192,128) : FmhaFwdTileSize(128, 128, 32, 128, 32,  192,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
+        ### (192,192) : FmhaFwdTileSize(128, 128, 32, 192, 32,  192,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  1),
+            (256,256) : FmhaFwdTileSize(128, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  32, 32, 16,  32, 32, 16,  -1),
        }
    elif dtype == 'fp8' or dtype == 'bf8':
        return {
-            '64'  : FmhaFwdTileSize(128, 64,  32, 64,  32,  64,   2, 1, 1,  2, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
-            '128' : FmhaFwdTileSize(128, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
-            '256' : FmhaFwdTileSize(128, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
+            (64,64 )  : FmhaFwdTileSize(128, 64,  32, 64,  32,  64,   2, 1, 1,  2, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
+            (128,128) : FmhaFwdTileSize(128, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
+            (256,256) : FmhaFwdTileSize(128, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  32, 32, 32,  32, 32, 32,  -1),
        }
    else:
        return None
@@ -454,7 +457,7 @@ def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
 def get_fwd_blobs(kernel_filter : Optional[str], receipt, optdim_list, mask_impl) -> Tuple[FmhaFwdApiPool, List[FmhaFwdKernel]]:
    # TODO: we don't support tuning yet, so pick up one value for vlayout/pipeline/pad
    #       support this in future
-    def get_pipelines(dtype, hdim) -> List[FmhaFwdPipeline]:
+    def get_pipelines(dtype, hdim, hdim_v) -> List[FmhaFwdPipeline]:
        # this function will populate a list possible pipelines
        # TODO: the order of List matters! the later in this list will be also be checked later
        # TODO: currently for qr pipeline, let 't' padding to appear later!!
@@ -463,7 +466,7 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, optdim_list, mask_impl
        pipelines = []
        if dtype in ['fp16', 'bf16']:
            for logits, mask, bias, lse, dropout, skip in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"], ["t", "f"], ["t", "f"]):
-                if hdim == 256:
+                if hdim == 256 and hdim_v == 256:
                # if True:
                    pipelines.append(FmhaFwdPipeline('qr', 'row', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip))
                    pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', logits, bias, lse, dropout, squant, mask, skip))
@@ -507,15 +510,13 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, optdim_list, mask_impl
        if d == None:
            continue
        #for hdim_str, mode, mask, bias, lse in itertools.product(d.keys(), MODE_MAP.keys(), MASK_MAP.keys(), ["t", "f"], ["t", "f"]):
-        for hdim_str, mode in itertools.product(d.keys(), MODE_MAP.keys()):
-            tile = d[hdim_str]
-            hdim = int(hdim_str)
-            for pipeline in get_pipelines(dtype, hdim):
+        for ((hdim, hdim_v), tile), mode in itertools.product(d.items(), MODE_MAP.keys()):
+            for pipeline in get_pipelines(dtype, hdim, hdim_v):
                if mode == "group":
                    if pipeline.F_spad != 't' or pipeline.F_skpad != 't':
                        # in group mode, spad/skpad must be true, since we can't predict if seqlen of current batch need pad or not
                        continue
-                if hdim == 192 and tile.F_bn1 == 128:
+                if (hdim, hdim_v) == (192, 128) or hdim == 160:
                    # NOTE: this is used to speedup deepseek prefill case, we don't gen training
                    if pipeline.F_bias != 'no' or pipeline.F_dropout == 't':
                        continue
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
@@ -34,6 +34,7 @@ K0_MAX_SUBMAX_MAP = {
    64 : 64,
    96 : 128,
    128: 128,
+    # 160: 160,
    256: 256
 }

@@ -638,6 +639,7 @@ def get_fmha_fwd_tile_dict_from_dtype(dtype : str) -> Optional[dict]:
            '64'  : FmhaFwdTileSize(64, 64,  32, 64,  32,  64,   4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
        ### '96'  : FmhaFwdTileSize(64, 128, 32, 128, 32,  96,   4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
            '128' : FmhaFwdTileSize(64, 128, 32, 128, 32,  128,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
+        ### '160' : FmhaFwdTileSize(64, 128, 32, 160, 32,  160,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
            '256' : FmhaFwdTileSize(64, 128, 32, 256, 32,  256,  4, 1, 1,  4, 1, 1,  16, 16, 16,  16, 16, 16,  -1),
        }
    elif dtype == 'fp8' or dtype == 'bf8':
@@ -656,6 +658,7 @@ def get_fmha_fwd_splitkv_combine_tile_dict_from_dtype(dtype : str) -> Optional[d
            '64'  : FmhaFwdSplitKVCombineTileSize(32,  -1),
        ### '96'  : FmhaFwdSplitKVCombineTileSize(32,  -1),
            '128' : FmhaFwdSplitKVCombineTileSize(32,  -1),
+        ### '160' : FmhaFwdSplitKVCombineTileSize(32,  -1),
            '256' : FmhaFwdSplitKVCombineTileSize(32,  -1),
    }
    elif dtype == 'fp8' or dtype == 'bf8':
@@ -683,7 +686,7 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
        if dtype in ['fp16', 'bf16']:
            for logits, mask, bias, pagedkv in itertools.product(["t", "f"], get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"]):
                # TODO: use async pipeline when compiler is more stable
-                if hdim == 256 or hdim in [32, 64, 128]:         ### [32, 64, 96, 128]:
+                if hdim == 256 or hdim in [32, 64, 128]:         ### [32, 64, 96, 128, 160]:
                # if True:
                    pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))
                    pipelines.append(Pipeline('qr', 'col', 'f', 't', 'f', 'f', logits, bias, 't', squant, pagedkv, mask))
--- a/example/ck_tile/01_fmha/fmha_bwd.cpp
+++ b/example/ck_tile/01_fmha/fmha_bwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "fmha_bwd.hpp"
 #include "ck_tile/host.hpp"
@@ -756,22 +756,17 @@ bool run(const ck_tile::ArgParser& arg_parser)

        if(p_drop > 0)
        {
-            p_hp_host_ref.ForEach(
-                [&](auto& self, auto idx) { p_dropped_hp_host_ref(idx) = self(idx); });
+            p_dropped_hp_host_ref = p_hp_host_ref;
            randval_host_ref.ForEach([&](auto& self, auto idx) {
                self(idx) = randval_host(b, idx[0], idx[1] + query_offset, idx[2]);
            });
            ck_tile::reference_batched_dropout(
                p_dropped_hp_host_ref, randval_host_ref, p_undrop_in_uint8_t, rp_undrop);
-            p_dropped_hp_host_ref.ForEach([&](auto& self, auto idx) {
-                p_lp_host_ref(idx) = ck_tile::type_convert<GemmDataType>(self(idx));
-            });
+            p_lp_host_ref = p_dropped_hp_host_ref.template CopyAsType<GemmDataType>();
        }
        else
        {
-            p_hp_host_ref.ForEach([&](auto& self, auto idx) {
-                p_lp_host_ref(idx) = ck_tile::type_convert<GemmDataType>(self(idx));
-            });
+            p_lp_host_ref = p_hp_host_ref.template CopyAsType<GemmDataType>();
        }

        // O = P * V
@@ -854,29 +849,27 @@ bool run(const ck_tile::ArgParser& arg_parser)
        }

        // dS_i_j = P_i_j .* (dP_i_j - dO_i dot O_i)
-        ds_hp_host_ref.ForEach([&](auto& self, auto idx_gmn) {
-            AccDataType do_dot_o = 0;
-            for(int o = 0; o < hdim_v; o++)
-            {
-                auto idx_gmo = idx_gmn;
-                idx_gmo[2]   = o;
-                do_dot_o += ck_tile::type_convert<AccDataType>(do_host_ref(idx_gmo)) *
-                            ck_tile::type_convert<AccDataType>(o_host_refs[wb](idx_gmo));
-            }
-            self(idx_gmn) = ck_tile::type_convert<AccDataType>(
-                p_hp_host_refs[wb](idx_gmn) * (dp_hp_host_ref(idx_gmn) - do_dot_o));
-        });
+        ck_tile::make_ParallelTensorFunctor(
+            [&](auto i0, auto i1, auto i2) {
+                AccDataType do_dot_o = 0;
+                for(int o = 0; o < hdim_v; o++)
+                {
+                    do_dot_o += ck_tile::type_convert<AccDataType>(do_host_ref(i0, i1, o)) *
+                                ck_tile::type_convert<AccDataType>(o_host_refs[wb](i0, i1, o));
+                }
+                ds_hp_host_ref(i0, i1, i2) = ck_tile::type_convert<AccDataType>(
+                    p_hp_host_refs[wb](i0, i1, i2) * (dp_hp_host_ref(i0, i1, i2) - do_dot_o));
+            },
+            ds_hp_host_ref.mDesc.get_lengths()[0],
+            ds_hp_host_ref.mDesc.get_lengths()[1],
+            ds_hp_host_ref.mDesc.get_lengths()[2])(std::thread::hardware_concurrency());

        if(use_dbias)
        {
-            ds_hp_host_ref.ForEach([&](auto& self, auto idx) {
-                dbias_host_ref(idx) = ck_tile::type_convert<BiasGradDataType>(self(idx));
-            });
+            dbias_host_ref = ds_hp_host_ref.template CopyAsType<BiasGradDataType>();
        }

-        ds_hp_host_ref.ForEach([&](auto& self, auto idx) {
-            ds_lp_host_ref(idx) = ck_tile::type_convert<GemmDataType>(self(idx));
-        });
+        ds_lp_host_ref = ds_hp_host_ref.template CopyAsType<GemmDataType>();

        // dV = P_drop^T@dO^T
        // dV = P^T@dO^T w/o dropout
--- a/example/ck_tile/01_fmha/fmha_fwd.cpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "fmha_fwd.hpp"
 #include "ck_tile/host.hpp"
@@ -178,50 +178,30 @@ auto get_elimit<FmhaFwdFp8>(std::string init_method)
    }
 }

-int num_splits_heuristic(int batch_nhead_mblocks, int num_SMs, int num_n_blocks, int max_splits)
+int num_splits_heuristic(int batch_nhead_mblocks, int num_SMs, int max_splits)
 {
    // If we have enough to almost fill the SMs, then just use 1 split
    if(batch_nhead_mblocks >= 0.8f * num_SMs)
    {
        return 1;
    }
-    max_splits           = std::min({max_splits, num_SMs, num_n_blocks});
+    max_splits           = std::min({max_splits, num_SMs});
    float max_efficiency = 0.f;
    std::vector<float> efficiency;
    efficiency.reserve(max_splits);
-    auto ceildiv = [](int a, int b) { return (a + b - 1) / b; };
-    // Some splits are not eligible. For example, if we have 64 blocks and choose 11 splits,
-    // we'll have 6 * 10 + 4 blocks. If we choose 12 splits, we'll have 6 * 11 + (-2) blocks
-    // (i.e. it's 11 splits anyway).
-    // So we check if the number of blocks per split is the same as the previous num_splits.
-    auto is_split_eligible = [&ceildiv, &num_n_blocks](int num_splits) {
-        return num_splits == 1 ||
-               ceildiv(num_n_blocks, num_splits) != ceildiv(num_n_blocks, num_splits - 1);
-    };
    for(int num_splits = 1; num_splits <= max_splits; num_splits++)
    {
-        if(!is_split_eligible(num_splits))
+        float n_waves = float(batch_nhead_mblocks * num_splits) / num_SMs;
+        float eff     = n_waves / ceil(n_waves);
+        // printf("num_splits = %d, eff = %f\n", num_splits, eff);
+        if(eff > max_efficiency)
        {
-            efficiency.push_back(0.f);
-        }
-        else
-        {
-            float n_waves = float(batch_nhead_mblocks * num_splits) / num_SMs;
-            float eff     = n_waves / ceil(n_waves);
-            // printf("num_splits = %d, eff = %f\n", num_splits, eff);
-            if(eff > max_efficiency)
-            {
-                max_efficiency = eff;
-            }
-            efficiency.push_back(eff);
+            max_efficiency = eff;
        }
+        efficiency.push_back(eff);
    }
    for(int num_splits = 1; num_splits <= max_splits; num_splits++)
    {
-        if(!is_split_eligible(num_splits))
-        {
-            continue;
-        }
        if(efficiency[num_splits - 1] >= 0.85 * max_efficiency)
        {
            // printf("num_splits chosen = %d\n", num_splits);
@@ -234,6 +214,7 @@ int num_splits_heuristic(int batch_nhead_mblocks, int num_SMs, int num_n_blocks,
 int override_num_splits_if_necessary(
    int batch, int nhead, int max_seqlen_q, int hdim_v, float p_drop, int num_splits)
 {
+    (void)hdim_v;
    int device;
    auto status = hipGetDevice(&device);
    if(status != hipSuccess)
@@ -250,15 +231,13 @@ int override_num_splits_if_necessary(

    // tile size should match the generate.py
    const int kM0 = 64;
-    const int kN1 = hdim_v;

    const int num_m_blocks = ck_tile::integer_divide_ceil(max_seqlen_q, kM0);
-    const int num_n_blocks = ck_tile::integer_divide_ceil(hdim_v, kN1);

    if(num_splits < 1 && p_drop == 0.0f)
    {
        return num_splits_heuristic(
-            batch * nhead * num_m_blocks, props.multiProcessorCount * 2, num_n_blocks, 128);
+            batch * nhead * num_m_blocks, props.multiProcessorCount * 2, 128);
    }

    return num_splits;
@@ -542,8 +521,8 @@ bool run(const ck_tile::ArgParser& arg_parser)
                max_seqlen_k = real_seqlen_k;
            }

-            flop += nhead * (static_cast<std::size_t>(2) * real_seqlen_q * real_seqlen_k * hdim_q +
-                             static_cast<std::size_t>(2) * real_seqlen_q * hdim_v * real_seqlen_k);
+            flop += nhead * (static_cast<std::size_t>(2) * mask.get_unmaskarea() * hdim_q +
+                             static_cast<std::size_t>(2) * mask.get_unmaskarea() * hdim_v);

            num_byte += nhead * (sizeof(QDataType) * real_seqlen_q * hdim_q +
                                 sizeof(KDataType) * real_seqlen_k * hdim_q +
--- a/example/ck_tile/01_fmha/fmha_fwd.hpp
+++ b/example/ck_tile/01_fmha/fmha_fwd.hpp
@@ -715,102 +715,102 @@ auto fmha_batch_prefill_create_kargs_and_grids(fmha_batch_prefill_args args)
        // create group mode kernel arguments
        if constexpr(FmhaKernel::kIsGroupMode)
        {
-            return FmhaKernel::MakeKargsImpl(args.q_ptr,
-                                             args.k_ptr,
-                                             args.v_ptr,
-                                             args.bias_ptr,
-                                             args.rand_val_ptr,
-                                             args.lse_ptr,
-                                             args.o_ptr,
-                                             args.seqstart_q_ptr,
-                                             args.hdim_q,
-                                             args.hdim_v,
-                                             args.nhead_q,
-                                             args.nhead_q / args.nhead_k,
-                                             args.num_total_pages,
-                                             args.kv_indptr,
-                                             args.kv_page_indices,
+            return FmhaKernel::MakeKargs(args.q_ptr,
+                                         args.k_ptr,
+                                         args.v_ptr,
+                                         args.bias_ptr,
+                                         args.rand_val_ptr,
+                                         args.lse_ptr,
+                                         args.o_ptr,
+                                         args.seqstart_q_ptr,
+                                         args.hdim_q,
+                                         args.hdim_v,
+                                         args.nhead_q,
+                                         args.nhead_q / args.nhead_k,
+                                         args.num_total_pages,
+                                         args.kv_indptr,
+                                         args.kv_page_indices,
 #if 0 // we assume page_block_size=1 for now
                                         args.kv_last_page_lens,
                                         args.page_block_size,
 #endif
-                                             args.scale_s,
-                                             args.scale_p,
-                                             args.scale_o,
-                                             args.logits_soft_cap,
-                                             args.stride_q,
-                                             args.stride_k,
-                                             args.stride_v,
-                                             args.stride_bias,
-                                             args.stride_randval,
-                                             args.stride_o,
-                                             args.nhead_stride_q,
-                                             args.nhead_stride_k,
-                                             args.nhead_stride_v,
-                                             args.nhead_stride_bias,
-                                             args.nhead_stride_randval,
-                                             args.nhead_stride_lse,
-                                             args.nhead_stride_o,
-                                             args.batch_stride_k,
-                                             args.batch_stride_v,
-                                             args.window_size_left,
-                                             args.window_size_right,
-                                             args.mask_type,
-                                             args.p_drop,
-                                             args.s_randval,
-                                             args.drop_seed_offset);
+                                         args.scale_s,
+                                         args.scale_p,
+                                         args.scale_o,
+                                         args.logits_soft_cap,
+                                         args.stride_q,
+                                         args.stride_k,
+                                         args.stride_v,
+                                         args.stride_bias,
+                                         args.stride_randval,
+                                         args.stride_o,
+                                         args.nhead_stride_q,
+                                         args.nhead_stride_k,
+                                         args.nhead_stride_v,
+                                         args.nhead_stride_bias,
+                                         args.nhead_stride_randval,
+                                         args.nhead_stride_lse,
+                                         args.nhead_stride_o,
+                                         args.batch_stride_k,
+                                         args.batch_stride_v,
+                                         args.window_size_left,
+                                         args.window_size_right,
+                                         args.mask_type,
+                                         args.p_drop,
+                                         args.s_randval,
+                                         args.drop_seed_offset);
        }
        else
        { // create batch mode kernel arguments
-            return FmhaKernel::MakeKargsImpl(args.q_ptr,
-                                             args.k_ptr,
-                                             args.v_ptr,
-                                             args.bias_ptr,
-                                             args.rand_val_ptr,
-                                             args.lse_ptr,
-                                             args.o_ptr,
-                                             args.seqlen_q,
-                                             args.hdim_q,
-                                             args.hdim_v,
-                                             args.nhead_q,
-                                             args.nhead_q / args.nhead_k,
-                                             args.num_total_pages,
-                                             args.kv_indptr,
-                                             args.kv_page_indices,
+            return FmhaKernel::MakeKargs(args.q_ptr,
+                                         args.k_ptr,
+                                         args.v_ptr,
+                                         args.bias_ptr,
+                                         args.rand_val_ptr,
+                                         args.lse_ptr,
+                                         args.o_ptr,
+                                         args.seqlen_q,
+                                         args.hdim_q,
+                                         args.hdim_v,
+                                         args.nhead_q,
+                                         args.nhead_q / args.nhead_k,
+                                         args.num_total_pages,
+                                         args.kv_indptr,
+                                         args.kv_page_indices,
 #if 0 // we assume page_block_size=1 for now
                                         args.kv_last_page_lens,
                                         args.page_block_size,
 #endif
-                                             args.scale_s,
-                                             args.scale_p,
-                                             args.scale_o,
-                                             args.logits_soft_cap,
-                                             args.stride_q,
-                                             args.stride_k,
-                                             args.stride_v,
-                                             args.stride_bias,
-                                             args.stride_randval,
-                                             args.stride_o,
-                                             args.nhead_stride_q,
-                                             args.nhead_stride_k,
-                                             args.nhead_stride_v,
-                                             args.nhead_stride_bias,
-                                             args.nhead_stride_randval,
-                                             args.nhead_stride_lse,
-                                             args.nhead_stride_o,
-                                             args.batch_stride_q,
-                                             args.batch_stride_k,
-                                             args.batch_stride_v,
-                                             args.batch_stride_bias,
-                                             args.batch_stride_randval,
-                                             args.batch_stride_lse,
-                                             args.batch_stride_o,
-                                             args.window_size_left,
-                                             args.window_size_right,
-                                             args.mask_type,
-                                             args.p_drop,
-                                             args.s_randval,
-                                             args.drop_seed_offset);
+                                         args.scale_s,
+                                         args.scale_p,
+                                         args.scale_o,
+                                         args.logits_soft_cap,
+                                         args.stride_q,
+                                         args.stride_k,
+                                         args.stride_v,
+                                         args.stride_bias,
+                                         args.stride_randval,
+                                         args.stride_o,
+                                         args.nhead_stride_q,
+                                         args.nhead_stride_k,
+                                         args.nhead_stride_v,
+                                         args.nhead_stride_bias,
+                                         args.nhead_stride_randval,
+                                         args.nhead_stride_lse,
+                                         args.nhead_stride_o,
+                                         args.batch_stride_q,
+                                         args.batch_stride_k,
+                                         args.batch_stride_v,
+                                         args.batch_stride_bias,
+                                         args.batch_stride_randval,
+                                         args.batch_stride_lse,
+                                         args.batch_stride_o,
+                                         args.window_size_left,
+                                         args.window_size_right,
+                                         args.mask_type,
+                                         args.p_drop,
+                                         args.s_randval,
+                                         args.drop_seed_offset);
        }
    }();

--- a/example/ck_tile/01_fmha/mask.hpp
+++ b/example/ck_tile/01_fmha/mask.hpp
@@ -21,6 +21,8 @@ enum class mask_enum
 struct mask_info
 {
    mask_enum type;
+    ck_tile::index_t seqlen_q;
+    ck_tile::index_t seqlen_k;
    ck_tile::index_t y, x;
    ck_tile::index_t left, right; // FA style SWA left/right

@@ -42,6 +44,8 @@ struct mask_info
        ck_tile::index_t x_total = seqlen_k;
        ck_tile::index_t y_total = seqlen_q;
        mask_info tmp;
+        tmp.seqlen_q = seqlen_q;
+        tmp.seqlen_k = seqlen_k;
        auto found_0 = str.find(':');
        if(found_0 != std::string::npos)
        {
@@ -148,7 +152,22 @@ struct mask_info
        }
        return tmp;
    }
-
+    ck_tile::index_t get_unmaskarea() const
+    {
+        if(type == mask_enum::no_mask)
+            return seqlen_q * seqlen_k;
+        ck_tile::index_t area = 0;
+        for(ck_tile::index_t i_y = 0; i_y < seqlen_q; ++i_y)
+        {
+            ck_tile::index_t x_start = std::max(-y + i_y + 1, static_cast<ck_tile::index_t>(0));
+            ck_tile::index_t x_end   = std::min(i_y + x, seqlen_k);
+            if(x_end > x_start)
+            {
+                area += (x_end - x_start);
+            }
+        }
+        return area;
+    }
    friend std::ostream& operator<<(std::ostream& os, const mask_info& mi)
    {
        mi.serialize(os);
--- a/example/ck_tile/02_layernorm2d/generate.py
+++ b/example/ck_tile/02_layernorm2d/generate.py
@@ -75,22 +75,22 @@ struct layernorm2d_fwd_traits_
    using SmoothScaleDataType = ck_tile::remove_cvref_t<SmoothScaleDataType_>;
    using YScaleDataType = ck_tile::remove_cvref_t<YScaleDataType_>;

-    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= WarpSize;
-    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % WarpSize == 0);
+    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= ck_tile::get_warp_size();
+    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % ck_tile::get_warp_size() == 0);
    static constexpr ck_tile::index_t total_warps =
-        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / WarpSize;
+        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / ck_tile::get_warp_size();

    // num of warps along m
    static constexpr ck_tile::index_t BlockWarps_M = []() {
        if constexpr(is_warp_per_row)
        {
-            static_assert(WarpSize % ThreadPerBlock_N_ == 0);
-            return total_warps * (WarpSize / ThreadPerBlock_N_);
+            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
+            return total_warps * (ck_tile::get_warp_size() / ThreadPerBlock_N_);
        }
        else
        {
-            // static_assert(WarpSize % ThreadPerBlock_M_ == 0);
-            return total_warps / (ThreadPerBlock_N_ / WarpSize);
+            // static_assert(ck_tile::get_warp_size() % ThreadPerBlock_M_ == 0);
+            return total_warps / (ThreadPerBlock_N_ / ck_tile::get_warp_size());
        }
    }();

@@ -98,13 +98,13 @@ struct layernorm2d_fwd_traits_
    static constexpr ck_tile::index_t BlockWarps_N = []() {
        if constexpr(is_warp_per_row)
        {
-            static_assert(WarpSize % ThreadPerBlock_N_ == 0);
+            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
            return 1;
        }
        else
        {
-            static_assert(ThreadPerBlock_N_ % WarpSize == 0);
-            return ThreadPerBlock_N_ / WarpSize;
+            static_assert(ThreadPerBlock_N_ % ck_tile::get_warp_size() == 0);
+            return ThreadPerBlock_N_ / ck_tile::get_warp_size();
        }
    }();

--- a/example/ck_tile/03_gemm/README.md
+++ b/example/ck_tile/03_gemm/README.md
@@ -30,7 +30,7 @@ args:
   -stride_c    Tensor C stride (default:0)
          -v    0. No validation, 1. Validation on CPU, 2. Validation on GPU (default:2)
          -e    Absolute error tolerance (default:1e-5)
-       -prec    data type. fp16/bf16/fp8/bf8 (default:fp16)
+       -prec    data type. fp16/bf16/fp8/bf8/int8 (default:fp16)
     -warmup    number of iterations before benchmark the kernel (default:10)
     -repeat    number of iterations to benchmark the kernel (default:100)
      -timer    gpu:gpu timer, cpu:cpu timer (default:gpu)
--- a/example/ck_tile/03_gemm/gemm_basic.cpp
+++ b/example/ck_tile/03_gemm/gemm_basic.cpp
@@ -212,6 +212,11 @@ int run_gemm_example(int argc, char* argv[])
        return run_gemm_example_prec_type<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t>(
            a_layout, b_layout, argc, argv);
    }
+    else if(data_type == "i8")
+    {
+        return run_gemm_example_prec_type<ck_tile::int8_t, ck_tile::int8_t, int32_t>(
+            a_layout, b_layout, argc, argv);
+    }
    else if(data_type == "pk_int4_t")
    {
        // TODO: Add support for bhalf_t ADataType
--- a/example/ck_tile/03_gemm/gemm_utils.hpp
+++ b/example/ck_tile/03_gemm/gemm_utils.hpp
@@ -1,4 +1,3 @@
-
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

@@ -16,6 +15,25 @@
 #define CK_TILE_PIPELINE_COMPUTE_V4 3
 #define CK_TILE_PIPELINE_COMPUTE_V5 4

+// temporary workaround to get k_warp_tile based on PrecType and gfx950 or not
+template <typename PrecType, ck_tile::index_t M_Warp_Tile>
+constexpr ck_tile::index_t get_k_warp_tile()
+{
+#if defined(__gfx950__)
+    constexpr bool is_8bit_float =
+        std::is_same_v<PrecType, ck_tile::fp8_t> || std::is_same_v<PrecType, ck_tile::bf8_t>;
+    if constexpr(M_Warp_Tile == 32)
+        return is_8bit_float ? 64 : 16;
+    else
+        return is_8bit_float ? 128 : 32;
+#else
+    if constexpr(M_Warp_Tile == 32)
+        return 16;
+    else
+        return 32;
+#endif
+}
+
 struct GemmConfigBase
 {
    static constexpr bool kPadM = false;
@@ -90,7 +108,7 @@ struct GemmConfigComputeV3 : public GemmConfigBase

    static constexpr ck_tile::index_t M_Warp_Tile = 32;
    static constexpr ck_tile::index_t N_Warp_Tile = 32;
-    static constexpr ck_tile::index_t K_Warp_Tile = sizeof(PrecType) == 2 ? 16 : 64;
+    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();

    static constexpr bool DoubleSmemBuffer     = false;
    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V3;
@@ -109,7 +127,7 @@ struct GemmConfigComputeV3_1 : public GemmConfigBase

    static constexpr ck_tile::index_t M_Warp_Tile = 32;
    static constexpr ck_tile::index_t N_Warp_Tile = 32;
-    static constexpr ck_tile::index_t K_Warp_Tile = sizeof(PrecType) == 2 ? 16 : 64;
+    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();

    static constexpr bool DoubleSmemBuffer     = false;
    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V3;
@@ -128,7 +146,7 @@ struct GemmConfigComputeV3_2 : public GemmConfigBase

    static constexpr ck_tile::index_t M_Warp_Tile = 16;
    static constexpr ck_tile::index_t N_Warp_Tile = 16;
-    static constexpr ck_tile::index_t K_Warp_Tile = sizeof(PrecType) == 2 ? 32 : 128;
+    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();

    static constexpr bool DoubleSmemBuffer     = false;
    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V3;
@@ -151,7 +169,7 @@ struct GemmConfigComputeV4 : public GemmConfigBase

    static constexpr ck_tile::index_t M_Warp_Tile = 32;
    static constexpr ck_tile::index_t N_Warp_Tile = 32;
-    static constexpr ck_tile::index_t K_Warp_Tile = sizeof(PrecType) == 2 ? 16 : 64;
+    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();

    static constexpr bool DoubleSmemBuffer     = true;
    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V4;
@@ -170,7 +188,7 @@ struct GemmConfigComputeV4_1 : public GemmConfigBase

    static constexpr ck_tile::index_t M_Warp_Tile = 32;
    static constexpr ck_tile::index_t N_Warp_Tile = 32;
-    static constexpr ck_tile::index_t K_Warp_Tile = sizeof(PrecType) == 2 ? 16 : 64;
+    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();

    static constexpr bool DoubleSmemBuffer     = true;
    static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V4;
@@ -189,7 +207,7 @@ struct GemmConfigComputeV5 : public GemmConfigBase

    static constexpr ck_tile::index_t M_Warp_Tile = 32;
    static constexpr ck_tile::index_t N_Warp_Tile = 32;
-    static constexpr ck_tile::index_t K_Warp_Tile = sizeof(PrecType) == 2 ? 16 : 64;
+    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();

    static constexpr bool DoubleSmemBuffer               = false;
    static constexpr ck_tile::index_t Pipeline           = CK_TILE_PIPELINE_COMPUTE_V5;
@@ -245,6 +263,15 @@ struct GemmTypeConfig<ck_tile::half_t, ck_tile::pk_int4_t, ck_tile::half_t>
    using CDataType   = ck_tile::half_t;
 };

+template <>
+struct GemmTypeConfig<ck_tile::int8_t, ck_tile::int8_t, int32_t>
+{
+    using ADataType   = ck_tile::int8_t;
+    using BDataType   = ck_tile::int8_t;
+    using AccDataType = int32_t;
+    using CDataType   = int32_t;
+};
+
 template <typename T>
 struct DataTypeTraits;

@@ -260,6 +287,12 @@ struct DataTypeTraits<double>
    static constexpr const char* name = "fp64";
 };

+template <>
+struct DataTypeTraits<int32_t>
+{
+    static constexpr const char* name = "int32";
+};
+
 template <>
 struct DataTypeTraits<ck_tile::half_t>
 {
@@ -290,6 +323,12 @@ struct DataTypeTraits<ck_tile::pk_int4_t>
    static constexpr const char* name = "pk_int4_t";
 };

+template <>
+struct DataTypeTraits<ck_tile::int8_t>
+{
+    static constexpr const char* name = "int8";
+};
+
 template <ck_tile::index_t PipelineId>
 struct PipelineTypeTraits;

--- a/example/ck_tile/03_gemm/run_gemm_example.inc
+++ b/example/ck_tile/03_gemm/run_gemm_example.inc
@@ -297,8 +297,8 @@ int run_gemm_example_with_layouts(int argc,

    if(init_method == 0)
    {
-        ck_tile::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
-        ck_tile::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
+        ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_m_k);
+        ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_k_n);
    }
    else if(init_method == 1)
    {
@@ -415,29 +415,19 @@ int run_gemm_example_with_layouts(int argc,
            // Restore input for B for gpu reference
            b_k_n_dev_buf.ToDevice(b_k_n.data());
        }
+
+        // memory on host to store gpu reference result
        ck_tile::HostTensor<CDataType> c_m_n_gpu_ref(
            ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
+        // memory on device to store gpu reference result
        ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_gpu_ref.get_element_space_size_in_bytes());
+
        c_m_n_gpu_ref.SetZero();
        c_m_n_gpu_buf_ref.SetZero();

-        ADataType* d_A;
-        BDataType* d_B;
-        CDataType* d_C;
-
-        ck_tile::hip_check_error(hipMalloc(&d_A, a_m_k.get_element_space_size_in_bytes()));
-        ck_tile::hip_check_error(hipMalloc(&d_B, b_k_n.get_element_space_size_in_bytes()));
-        ck_tile::hip_check_error(
-            hipMalloc(&d_C, c_m_n_dev_result.get_element_space_size_in_bytes()));
-
-        ck_tile::hip_check_error(hipMemcpy(d_A,
-                                           a_m_k_dev_buf.GetDeviceBuffer(),
-                                           a_m_k.get_element_space_size_in_bytes(),
-                                           hipMemcpyHostToDevice));
-        ck_tile::hip_check_error(hipMemcpy(d_B,
-                                           b_k_n_dev_buf.GetDeviceBuffer(),
-                                           b_k_n.get_element_space_size_in_bytes(),
-                                           hipMemcpyHostToDevice));
+        ADataType* d_A = static_cast<ADataType*>(a_m_k_dev_buf.GetDeviceBuffer());
+        BDataType* d_B = static_cast<BDataType*>(b_k_n_dev_buf.GetDeviceBuffer());
+        CDataType* d_C = static_cast<CDataType*>(c_m_n_gpu_buf_ref.GetDeviceBuffer());

        ck_tile::reference_gemm_gpu<ADataType,
                                    BDataType,
@@ -447,16 +437,8 @@ int run_gemm_example_with_layouts(int argc,
                                    BLayout,
                                    CLayout>(d_A, d_B, d_C, M, N, K, stride_A, stride_B, stride_C);

-        ck_tile::hip_check_error(hipMemcpy(c_m_n_gpu_buf_ref.GetDeviceBuffer(),
-                                           d_C,
-                                           c_m_n_dev_result.get_element_space_size_in_bytes(),
-                                           hipMemcpyDeviceToHost));
-
-        ck_tile::hip_check_error(hipFree(d_A));
-        ck_tile::hip_check_error(hipFree(d_B));
-        ck_tile::hip_check_error(hipFree(d_C));
-
        c_m_n_gpu_buf_ref.FromDevice(c_m_n_gpu_ref.data());
+
        const float max_accumulated_value =
            *std::max_element(c_m_n_gpu_ref.mData.begin(), c_m_n_gpu_ref.mData.end());
        const auto rtol_atol = calculate_rtol_atol<ADataType, BDataType, AccDataType, CDataType>(
--- a/example/ck_tile/03_gemm/universal_gemm.cpp
+++ b/example/ck_tile/03_gemm/universal_gemm.cpp
@@ -299,6 +299,13 @@ int run_gemm_example(int argc, char* argv[])
                                          ck_tile::bf8_t,
                                          ck_tile::half_t>(a_layout, b_layout, argc, argv);
    }
+    else if(data_type == "int8")
+    {
+        return run_gemm_example_prec_type<GemmConfig<ck_tile::int8_t>,
+                                          ck_tile::int8_t,
+                                          ck_tile::int8_t,
+                                          ck_tile::int32_t>(a_layout, b_layout, argc, argv);
+    }
    else if(data_type == "pk_int4_t")
    {
        // TODO: Add support for bhalf_t ADataType
--- a/example/ck_tile/05_reduce/reduce.hpp
+++ b/example/ck_tile/05_reduce/reduce.hpp
@@ -35,7 +35,7 @@ struct Reduce2dShape
    static constexpr index_t Repeat_N = Block_N / (WarpPerBlock_N * Warp_N);

    static constexpr index_t BlockSize =
-        WarpSize * reduce_on_sequence(BlockWarps{}, multiplies{}, number<1>{});
+        ck_tile::get_warp_size() * reduce_on_sequence(BlockWarps{}, multiplies{}, number<1>{});
 };

 template <typename XDataType_,
--- a/example/ck_tile/10_rmsnorm2d/generate.py
+++ b/example/ck_tile/10_rmsnorm2d/generate.py
@@ -74,22 +74,22 @@ struct rmsnorm2d_fwd_traits_
    using YScaleDataType      = ck_tile::remove_cvref_t<YScaleDataType_>;
    using UnquantYDataType    = ck_tile::remove_cvref_t<UnquantYDataType_>;

-    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= WarpSize;
-    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % WarpSize == 0);
+    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= ck_tile::get_warp_size();
+    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % ck_tile::get_warp_size() == 0);
    static constexpr ck_tile::index_t total_warps =
-        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / WarpSize;
+        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / ck_tile::get_warp_size();

    // num of warps along m
    static constexpr ck_tile::index_t BlockWarps_M = []() {
        if constexpr(is_warp_per_row)
        {
-            static_assert(WarpSize % ThreadPerBlock_N_ == 0);
-            return total_warps * (WarpSize / ThreadPerBlock_N_);
+            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
+            return total_warps * (ck_tile::get_warp_size() / ThreadPerBlock_N_);
        }
        else
        {
-            // static_assert(WarpSize % ThreadPerBlock_M_ == 0);
-            return total_warps / (ThreadPerBlock_N_ / WarpSize);
+            // static_assert(ck_tile::get_warp_size() % ThreadPerBlock_M_ == 0);
+            return total_warps / (ThreadPerBlock_N_ / ck_tile::get_warp_size());
        }
    }();

@@ -97,13 +97,13 @@ struct rmsnorm2d_fwd_traits_
    static constexpr ck_tile::index_t BlockWarps_N = []() {
        if constexpr(is_warp_per_row)
        {
-            static_assert(WarpSize % ThreadPerBlock_N_ == 0);
+            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
            return 1;
        }
        else
        {
-            static_assert(ThreadPerBlock_N_ % WarpSize == 0);
-            return ThreadPerBlock_N_ / WarpSize;
+            static_assert(ThreadPerBlock_N_ % ck_tile::get_warp_size() == 0);
+            return ThreadPerBlock_N_ / ck_tile::get_warp_size();
        }
    }();

@@ -712,4 +712,4 @@ if __name__ == "__main__":
    if args.list_blobs:
        list_blobs(args)
    else:
-        gen_blobs(args)
+        gen_blobs(args)
--- a/example/ck_tile/12_smoothquant/smoothquant.hpp
+++ b/example/ck_tile/12_smoothquant/smoothquant.hpp
@@ -49,22 +49,22 @@ struct smoothquant_traits_
 {
    using DataType = ck_tile::remove_cvref_t<DataType_>;

-    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= WarpSize;
-    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % WarpSize == 0);
+    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= ck_tile::get_warp_size();
+    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % ck_tile::get_warp_size() == 0);
    static constexpr ck_tile::index_t total_warps =
-        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / WarpSize;
+        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / ck_tile::get_warp_size();

    // num of warps along m
    static constexpr ck_tile::index_t BlockWarps_M = []() {
        if constexpr(is_warp_per_row)
        {
-            static_assert(WarpSize % ThreadPerBlock_N_ == 0);
-            return total_warps * (WarpSize / ThreadPerBlock_N_);
+            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
+            return total_warps * (ck_tile::get_warp_size() / ThreadPerBlock_N_);
        }
        else
        {
-            // static_assert(WarpSize % ThreadPerBlock_M_ == 0);
-            return total_warps / (ThreadPerBlock_N_ / WarpSize);
+            // static_assert(ck_tile::get_warp_size() % ThreadPerBlock_M_ == 0);
+            return total_warps / (ThreadPerBlock_N_ / ck_tile::get_warp_size());
        }
    }();

@@ -72,13 +72,13 @@ struct smoothquant_traits_
    static constexpr ck_tile::index_t BlockWarps_N = []() {
        if constexpr(is_warp_per_row)
        {
-            static_assert(WarpSize % ThreadPerBlock_N_ == 0);
+            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
            return 1;
        }
        else
        {
-            static_assert(ThreadPerBlock_N_ % WarpSize == 0);
-            return ThreadPerBlock_N_ / WarpSize;
+            static_assert(ThreadPerBlock_N_ % ck_tile::get_warp_size() == 0);
+            return ThreadPerBlock_N_ / ck_tile::get_warp_size();
        }
    }();

--- a/example/ck_tile/14_moe_smoothquant/moe_smoothquant.hpp
+++ b/example/ck_tile/14_moe_smoothquant/moe_smoothquant.hpp
@@ -38,22 +38,22 @@ struct moe_smoothquant_traits_
    using InputType  = ck_tile::remove_cvref_t<InputType_>;
    using OutputType = ck_tile::remove_cvref_t<OutputType_>;

-    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= WarpSize;
-    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % WarpSize == 0);
+    static constexpr bool is_warp_per_row = ThreadPerBlock_N_ <= ck_tile::get_warp_size();
+    static_assert((ThreadPerBlock_M_ * ThreadPerBlock_N_) % ck_tile::get_warp_size() == 0);
    static constexpr ck_tile::index_t total_warps =
-        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / WarpSize;
+        (ThreadPerBlock_M_ * ThreadPerBlock_N_) / ck_tile::get_warp_size();

    // num of warps along m
    static constexpr ck_tile::index_t BlockWarps_M = []() {
        if constexpr(is_warp_per_row)
        {
-            static_assert(WarpSize % ThreadPerBlock_N_ == 0);
-            return total_warps * (WarpSize / ThreadPerBlock_N_);
+            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
+            return total_warps * (ck_tile::get_warp_size() / ThreadPerBlock_N_);
        }
        else
        {
-            // static_assert(WarpSize % ThreadPerBlock_M_ == 0);
-            return total_warps / (ThreadPerBlock_N_ / WarpSize);
+            // static_assert(ck_tile::get_warp_size() % ThreadPerBlock_M_ == 0);
+            return total_warps / (ThreadPerBlock_N_ / ck_tile::get_warp_size());
        }
    }();

@@ -61,13 +61,13 @@ struct moe_smoothquant_traits_
    static constexpr ck_tile::index_t BlockWarps_N = []() {
        if constexpr(is_warp_per_row)
        {
-            static_assert(WarpSize % ThreadPerBlock_N_ == 0);
+            static_assert(ck_tile::get_warp_size() % ThreadPerBlock_N_ == 0);
            return 1;
        }
        else
        {
-            static_assert(ThreadPerBlock_N_ % WarpSize == 0);
-            return ThreadPerBlock_N_ / WarpSize;
+            static_assert(ThreadPerBlock_N_ % ck_tile::get_warp_size() == 0);
+            return ThreadPerBlock_N_ / ck_tile::get_warp_size();
        }
    }();

--- a/example/ck_tile/17_grouped_gemm/grouped_gemm_tileloop.cpp
+++ b/example/ck_tile/17_grouped_gemm/grouped_gemm_tileloop.cpp
@@ -116,9 +116,12 @@ float grouped_gemm_tileloop(const ck_tile::stream_config& s,
        using GemmEpilogue = ck_tile::CShuffleEpilogue<
            ck_tile::CShuffleEpilogueProblem<ADataType,
                                             BDataType,
+                                             ck_tile::tuple<>,
                                             AccDataType,
                                             CDataType,
+                                             ck_tile::tuple<>,
                                             CLayout,
+                                             ck_tile::element_wise::PassThrough,
                                             GemmPipelineProblem::kBlockSize,
                                             TilePartitioner::MPerBlock,
                                             TilePartitioner::NPerBlock,
--- a/example/ck_tile/18_flatmm/CMakeLists.txt
+++ b/example/ck_tile/18_flatmm/CMakeLists.txt
@@ -3,6 +3,4 @@ add_executable(tile_example_flatmm_basic EXCLUDE_FROM_ALL flatmm_basic.cpp)
 set(EXAMPLE_FLATMM_COMPILE_OPTIONS)
 # list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -Wno-undefined-func-template -Wno-float-equal)
 # list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -Wno-unused-variable -Wno-unused-parameter)
-list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -DUSING_MFMA_16x16x32=1 -Wno-unused-local-typedef)
-#list(APPEND EXAMPLE_FLATMM_COMPILE_OPTIONS -DUSING_MFMA_32x32x16=1 -Wno-unused-local-typedef)
 target_compile_options(tile_example_flatmm_basic PRIVATE ${EXAMPLE_FLATMM_COMPILE_OPTIONS})
--- a/example/ck_tile/18_flatmm/flatmm_basic.cpp
+++ b/example/ck_tile/18_flatmm/flatmm_basic.cpp
@@ -17,12 +17,12 @@ template <typename ADataType,
          typename BDataType,
          typename AccDataType,
          typename CDataType,
+          typename FlatmmConfig,
          typename ALayout,
          typename BLayout,
          typename CLayout>
 float flatmm_calc(const ck_tile::FlatmmHostArgs& args, const ck_tile::stream_config& s)
 {
-    using FlatmmConfig       = FlatmmConfig<ADataType>;
    using CodegenFlatmmShape = ck_tile::TileFlatmmShape<
        ck_tile::sequence<FlatmmConfig::M_Tile, FlatmmConfig::N_Tile, FlatmmConfig::K_Tile>,
        ck_tile::sequence<FlatmmConfig::M_Warp, FlatmmConfig::N_Warp, FlatmmConfig::K_Warp>,
@@ -32,18 +32,20 @@ float flatmm_calc(const ck_tile::FlatmmHostArgs& args, const ck_tile::stream_con

    using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenFlatmmShape>;

-    using CodegenGemmTraits      = ck_tile::TileGemmTraits<FlatmmConfig::kPadM,
+    using CodegenGemmTraits = ck_tile::TileGemmTraits<FlatmmConfig::kPadM,
                                                      FlatmmConfig::kPadN,
                                                      FlatmmConfig::kPadK,
                                                      ALayout,
                                                      BLayout,
                                                      CLayout>;
+
    using CodegenPipelineProblem = ck_tile::GemmPipelineProblem<ADataType,
                                                                BDataType,
                                                                AccDataType,
                                                                CodegenFlatmmShape,
                                                                CodegenGemmTraits>;
-    const auto Run               = [&](const auto memory_operation_) {
+
+    const auto Run = [&](const auto memory_operation_) {
        constexpr auto memory_operation = memory_operation_.value;

        using GemmEpilogue = ck_tile::CShuffleEpilogue<
@@ -151,6 +153,7 @@ float flatmm_calc(const ck_tile::FlatmmHostArgs& args, const ck_tile::stream_con
    }
 }

+template <template <typename PreType> typename FlatmmConfig>
 int run_flatmm_example(int argc, char* argv[])
 {
    auto [result, arg_parser] = create_args(argc, argv);
@@ -163,24 +166,27 @@ int run_flatmm_example(int argc, char* argv[])
    std::string data_type = arg_parser.get_str("prec");
    std::string a_layout  = arg_parser.get_str("a_layout");
    std::string b_layout  = arg_parser.get_str("b_layout");
-
    if(a_layout == "R" && b_layout == "C")
    {
        if(data_type == "fp16")
        {
-            run_flatmm_example_with_layouts<ck_tile::half_t>(argc, argv, Row{}, Col{}, Row{});
+            run_flatmm_example_with_layouts<ck_tile::half_t, FlatmmConfig<ck_tile::half_t>>(
+                argc, argv, Row{}, Col{}, Row{});
        }
        else if(data_type == "bf16")
        {
-            run_flatmm_example_with_layouts<ck_tile::bf16_t>(argc, argv, Row{}, Col{}, Row{});
+            run_flatmm_example_with_layouts<ck_tile::bf16_t, FlatmmConfig<ck_tile::bf16_t>>(
+                argc, argv, Row{}, Col{}, Row{});
        }
        else if(data_type == "fp8")
        {
-            run_flatmm_example_with_layouts<ck_tile::fp8_t>(argc, argv, Row{}, Col{}, Row{});
+            run_flatmm_example_with_layouts<ck_tile::fp8_t, FlatmmConfig<ck_tile::fp8_t>>(
+                argc, argv, Row{}, Col{}, Row{});
        }
        else if(data_type == "bf8")
        {
-            run_flatmm_example_with_layouts<ck_tile::bf8_t>(argc, argv, Row{}, Col{}, Row{});
+            run_flatmm_example_with_layouts<ck_tile::bf8_t, FlatmmConfig<ck_tile::bf8_t>>(
+                argc, argv, Row{}, Col{}, Row{});
        }
        else
        {
@@ -196,9 +202,29 @@ int run_flatmm_example(int argc, char* argv[])

 int main(int argc, char* argv[])
 {
+    auto [result, arg_parser] = create_args(argc, argv);
+    if(!result)
+        return EXIT_FAILURE;
+
    try
    {
-        return !run_flatmm_example(argc, argv);
+        int warp_tile = arg_parser.get_int("warp_tile");
+        if(warp_tile == 0)
+        {
+            return !run_flatmm_example<FlatmmConfig16>(argc, argv);
+        }
+        else if(warp_tile == 1)
+        {
+            return !run_flatmm_example<FlatmmConfig32>(argc, argv);
+        }
+        else if(warp_tile == 2)
+        {
+            return !run_flatmm_example<FlatmmConfig16_950>(argc, argv);
+        }
+        else
+        {
+            return !run_flatmm_example<FlatmmConfig32_950>(argc, argv);
+        }
    }
    catch(const std::runtime_error& e)
    {
--- a/example/ck_tile/18_flatmm/flatmm_basic.hpp
+++ b/example/ck_tile/18_flatmm/flatmm_basic.hpp
@@ -31,7 +31,63 @@
 #error "unsupported CK_TILE_PIPELINE_DEFAULT value"
 #endif

-template <typename ADataType, typename BDataType = ADataType, typename CDataType = ADataType>
+// GEMM config with 32x132 warp tile
+template <typename DataType>
+struct FlatmmConfig32
+{
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(DataType);
+
+    static constexpr ck_tile::index_t M_Warp = 1;
+    static constexpr ck_tile::index_t N_Warp = 4;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 32;
+    static constexpr ck_tile::index_t N_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = sizeof(DataType) == 2 ? 16 : 32;
+
+    static constexpr bool kPadM      = false;
+    static constexpr bool kPadN      = false;
+    static constexpr bool kPadK      = false;
+    static constexpr int kBlockPerCu = 2;
+};
+
+template <typename DataType>
+struct FlatmmConfig32_950 : public FlatmmConfig32<DataType>
+{
+    static constexpr ck_tile::index_t K_Warp_Tile = sizeof(DataType) == 2 ? 16 : 64;
+};
+
+// GEMM config with 16x16 warp tile
+template <typename DataType>
+struct FlatmmConfig16
+{
+    static constexpr ck_tile::index_t M_Tile = 128;
+    static constexpr ck_tile::index_t N_Tile = 128;
+    static constexpr ck_tile::index_t K_Tile = 128 / sizeof(DataType);
+
+    static constexpr ck_tile::index_t M_Warp = 1;
+    static constexpr ck_tile::index_t N_Warp = 4;
+    static constexpr ck_tile::index_t K_Warp = 1;
+
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = sizeof(DataType) == 2 ? 32 : 64;
+
+    static constexpr bool kPadM      = false;
+    static constexpr bool kPadN      = false;
+    static constexpr bool kPadK      = false;
+    static constexpr int kBlockPerCu = 2;
+};
+
+template <typename DataType>
+struct FlatmmConfig16_950 : public FlatmmConfig16<DataType>
+{
+    static constexpr ck_tile::index_t K_Warp_Tile = sizeof(DataType) == 2 ? 32 : 128;
+};
+
+template <typename ADataType>
 struct GemmBasicTypeConfig;

 template <>
@@ -103,47 +159,10 @@ struct DataTypeTraits<ck_tile::half_t>
    static constexpr const char* name = "fp16";
 };

-template <typename T>
-struct is_8bit_type
-    : std::bool_constant<std::is_same_v<T, ck_tile::fp8_t> || std::is_same_v<T, ck_tile::bf8_t>>
+template <>
+struct DataTypeTraits<ck_tile::bf16_t>
 {
-};
-
-template <typename ADataType>
-struct FlatmmConfig
-{
-#if defined(USING_MFMA_16x16x32)
-    static constexpr ck_tile::index_t M_Tile = 128;
-    static constexpr ck_tile::index_t N_Tile = 128;
-    static constexpr ck_tile::index_t K_Tile = 128;
-
-    static constexpr ck_tile::index_t M_Warp = 1;
-    static constexpr ck_tile::index_t N_Warp = 4;
-    static constexpr ck_tile::index_t K_Warp = 1;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = is_8bit_type<ADataType>::value ? 16 : 32;
-    static constexpr ck_tile::index_t N_Warp_Tile = is_8bit_type<ADataType>::value ? 16 : 32;
-    static constexpr ck_tile::index_t K_Warp_Tile = is_8bit_type<ADataType>::value ? 64 : 16;
-
-#elif defined(USING_MFMA_32x32x16)
-    static constexpr ck_tile::index_t M_Tile = 128;
-    static constexpr ck_tile::index_t N_Tile = 256;
-    static constexpr ck_tile::index_t K_Tile = 128;
-
-    static constexpr ck_tile::index_t M_Warp = 1;
-    static constexpr ck_tile::index_t N_Warp = 8;
-    static constexpr ck_tile::index_t K_Warp = 1;
-
-    static constexpr ck_tile::index_t M_Warp_Tile = is_8bit_type<ADataType>::value ? 32 : 32;
-    static constexpr ck_tile::index_t N_Warp_Tile = is_8bit_type<ADataType>::value ? 32 : 32;
-    static constexpr ck_tile::index_t K_Warp_Tile = is_8bit_type<ADataType>::value ? 32 : 16;
-#endif
-    // The kPadM, kPadN, kPadK & kBlockPerCu should also come from the Codegen part.
-    static constexpr bool kPadM = false;
-    static constexpr bool kPadN = false;
-    static constexpr bool kPadK = false;
-
-    static constexpr int kBlockPerCu = 2;
+    static constexpr const char* name = "bf16";
 };

 auto create_args(int argc, char* argv[])
@@ -163,8 +182,11 @@ auto create_args(int argc, char* argv[])
        .insert("warmup", "50", "number of iterations before benchmark the kernel")
        .insert("repeat", "100", "number of iterations to benchmark the kernel")
        .insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
-        .insert("split_k", "1", "splitK value");
-
+        .insert("split_k", "1", "splitK value")
+        .insert("init", "0", "0:random, 1:linear, 2:constant(1)")
+        .insert("warp_tile",
+                "0",
+                "0: 16x16, 1: 32x32, 2: 16x16x128 (950 only), 3: 32x32x64 (950 only)");
    bool result = arg_parser.parse(argc, argv);
    return std::make_tuple(result, arg_parser);
 }
@@ -174,6 +196,7 @@ template <typename ADataType,
          typename BDataType,
          typename AccDataType,
          typename CDataType,
+          typename FlatmmConfig,
          typename ALayout,
          typename BLayout,
          typename CLayout>
--- a/example/ck_tile/18_flatmm/run_flatmm_example.inc
+++ b/example/ck_tile/18_flatmm/run_flatmm_example.inc
@@ -73,6 +73,7 @@ template <typename ADataType,
          typename BDataType,
          typename AccDataType,
          typename CDataType,
+          typename FlatmmConfig,
          typename ALayout,
          typename BLayout,
          typename CLayout>
@@ -102,9 +103,15 @@ float invoke_flatmm(ck_tile::DeviceMem& a_dev_buf,
    args.stride_B = stride_B;
    args.stride_C = stride_C;

-    float ave_time =
-        flatmm_calc<ADataType, BDataType, AccDataType, CDataType, ALayout, BLayout, CLayout>(
-            args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat, true, true, 50});
+    float ave_time = flatmm_calc<ADataType,
+                                 BDataType,
+                                 AccDataType,
+                                 CDataType,
+                                 FlatmmConfig,
+                                 ALayout,
+                                 BLayout,
+                                 CLayout>(
+        args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat, true, true, 50});

    std::size_t flop = std::size_t(2) * M * N * K;
    std::size_t num_byte =
@@ -120,7 +127,11 @@ float invoke_flatmm(ck_tile::DeviceMem& a_dev_buf,
    return ave_time;
 }

-template <typename PrecType, typename ALayout, typename BLayout, typename CLayout>
+template <typename PrecType,
+          typename FlatmmConfig,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout>
 int run_flatmm_example_with_layouts(int argc,
                                    char* argv[],
                                    const ALayout a_layout                  = ALayout{},
@@ -131,11 +142,10 @@ int run_flatmm_example_with_layouts(int argc,
    if(!result)
        return -1;

-    using ADataType    = typename GemmBasicTypeConfig<PrecType>::ADataType;
-    using BDataType    = typename GemmBasicTypeConfig<PrecType>::BDataType;
-    using CDataType    = typename GemmBasicTypeConfig<PrecType>::CDataType;
-    using AccDataType  = typename GemmBasicTypeConfig<PrecType>::AccDataType;
-    using FlatmmConfig = FlatmmConfig<ADataType>;
+    using ADataType   = typename GemmBasicTypeConfig<PrecType>::ADataType;
+    using BDataType   = typename GemmBasicTypeConfig<PrecType>::BDataType;
+    using CDataType   = typename GemmBasicTypeConfig<PrecType>::CDataType;
+    using AccDataType = typename GemmBasicTypeConfig<PrecType>::AccDataType;

    ck_tile::index_t M = arg_parser.get_int("m");
    ck_tile::index_t N = arg_parser.get_int("n");
@@ -145,10 +155,10 @@ int run_flatmm_example_with_layouts(int argc,
    ck_tile::index_t stride_B = arg_parser.get_int("stride_b");
    ck_tile::index_t stride_C = arg_parser.get_int("stride_c");

-    ck_tile::index_t kbatch = arg_parser.get_int("split_k");
-
-    int n_warmup = arg_parser.get_int("warmup");
-    int n_repeat = arg_parser.get_int("repeat");
+    ck_tile::index_t kbatch      = arg_parser.get_int("split_k");
+    int n_warmup                 = arg_parser.get_int("warmup");
+    int n_repeat                 = arg_parser.get_int("repeat");
+    ck_tile::index_t init_method = arg_parser.get_int("init");

    stride_A = ck_tile::get_default_stride(M, K, stride_A, is_row_major(a_layout));
    stride_B = ck_tile::get_default_stride(K, N, stride_B, is_row_major(b_layout));
@@ -162,8 +172,26 @@ int run_flatmm_example_with_layouts(int argc,
        ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));

    // TODO: add different init types
-    ck_tile::FillUniformDistribution<ADataType>{-.5f, .5f}(a_host);
-    ck_tile::FillUniformDistribution<BDataType>{-.5f, .5f}(b_origin_host);
+    if(init_method == 0)
+    {
+        ck_tile::FillUniformDistribution<ADataType>{-.5f, .5f}(a_host);
+        ck_tile::FillUniformDistribution<BDataType>{-.5f, .5f}(b_origin_host);
+    }
+    else if(init_method == 1)
+    {
+        ck_tile::FillMonotonicSeq<ADataType>{}(a_host);
+        ck_tile::FillMonotonicSeq<BDataType>{}(b_origin_host);
+    }
+    else if(init_method == 2)
+    {
+        ck_tile::FillUniformDistribution<ADataType>{1.f, 1.f}(a_host);
+        ck_tile::FillUniformDistribution<BDataType>{1.f, 1.f}(b_origin_host);
+    }
+    else
+    {
+        a_host.SetZero();
+        b_origin_host.SetZero();
+    }

    ck_tile::DeviceMem a_dev_buf(a_host.get_element_space_size_in_bytes());
    ck_tile::DeviceMem c_dev_buf(c_rslt_host.get_element_space_size_in_bytes());
@@ -173,23 +201,28 @@ int run_flatmm_example_with_layouts(int argc,

    // do pre-shuffle
    ck_tile::HostTensor<BDataType> b_shuffle_host = shuffle_b<FlatmmConfig>(b_origin_host);
-
    ck_tile::DeviceMem b_shuffle_dev_buf(b_shuffle_host.get_element_space_size_in_bytes());
    b_shuffle_dev_buf.ToDevice(b_shuffle_host.data());

-    invoke_flatmm<ADataType, BDataType, AccDataType, CDataType, ALayout, BLayout, CLayout>(
-        a_dev_buf,
-        b_shuffle_dev_buf,
-        c_dev_buf,
-        M,
-        N,
-        K,
-        stride_A,
-        stride_B,
-        stride_C,
-        kbatch,
-        n_warmup,
-        n_repeat);
+    invoke_flatmm<ADataType,
+                  BDataType,
+                  AccDataType,
+                  CDataType,
+                  FlatmmConfig,
+                  ALayout,
+                  BLayout,
+                  CLayout>(a_dev_buf,
+                           b_shuffle_dev_buf,
+                           c_dev_buf,
+                           M,
+                           N,
+                           K,
+                           stride_A,
+                           stride_B,
+                           stride_C,
+                           kbatch,
+                           n_warmup,
+                           n_repeat);

    c_dev_buf.FromDevice(c_rslt_host.data());
    bool pass = true;
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -244,12 +244,6 @@
 // workaround: compiler issue on gfx908
 #define CK_WORKAROUND_SWDEV_388832 1

-// workaround: compiler issue on gfx950
-#define CK_WORKAROUND_FP32_TO_FP4_SR_CONVERSION 1
-
-// workaround: compiler issue on gfx950
-#define CK_TEMP_DISABLE_FP4_TESTS 1
-
 // workaround: compiler issue on gfx950
 #define CK_WORKAROUND_FP16_TO_FP8_CONVERSION 1

--- a/include/ck/library/utility/host_tensor.hpp
+++ b/include/ck/library/utility/host_tensor.hpp
@@ -167,7 +167,7 @@ struct HostTensorDescriptor
        return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
    }

-    std::size_t GetOffsetFromMultiIndex(std::vector<std::size_t> iss) const
+    std::size_t GetOffsetFromMultiIndex(const std::vector<std::size_t>& iss) const
    {
        return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
    }
@@ -600,12 +600,12 @@ struct Tensor
                     ck::packed_size_v<ck::remove_cvref_t<T>>];
    }

-    T& operator()(std::vector<std::size_t> idx)
+    T& operator()(const std::vector<std::size_t>& idx)
    {
        return mData[mDesc.GetOffsetFromMultiIndex(idx) / ck::packed_size_v<ck::remove_cvref_t<T>>];
    }

-    const T& operator()(std::vector<std::size_t> idx) const
+    const T& operator()(const std::vector<std::size_t>& idx) const
    {
        return mData[mDesc.GetOffsetFromMultiIndex(idx) / ck::packed_size_v<ck::remove_cvref_t<T>>];
    }
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp
@@ -122,7 +122,6 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1<
    using Base::B_K1;
    using Base::I0;
    using Base::I1;
-    using Base::KGroup;
    using Base::KRepeat;
    using Base::xdlops_gemm;
    using typename Base::HotLoopInstList;
@@ -154,9 +153,9 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1<
        constexpr index_t M0 = TileDesc_M0_M1_M2_K{}.GetLength(Number<0>{});
        constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
        constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
-        constexpr index_t K2 = KPack / KGroup;
+        constexpr index_t K2 = KPack;
        constexpr index_t K1 = 64 / NPerXDL;
-        constexpr index_t K0 = KRepeat * KGroup;
+        constexpr index_t K0 = KRepeat;

        return transform_tensor_descriptor(
            TileDesc_M0_M1_M2_K{},
@@ -291,14 +290,12 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1<
        block_sync_lds();
        static_for<0, MRepeat, 1>{}([&](auto m0) {
            static_for<0, KRepeat, 1>{}([&](auto k0) {
-                static_for<0, KGroup, 1>{}([&](auto kg0) {
-                    a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
-                                       make_tuple(m0, I0, I0, Number<k0 * 2 + kg0>{}, I0, I0),
-                                       a_block_buf,
-                                       a_thread_desc_,
-                                       make_tuple(m0, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
-                                       a_thread_buf);
-                });
+                a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                   make_tuple(m0, I0, I0, k0, I0, I0),
+                                   a_block_buf,
+                                   a_thread_desc_,
+                                   make_tuple(m0, I0, I0, k0, I0, I0),
+                                   a_thread_buf);
            });
        });
        // B VGPR->VGPR dequant
@@ -391,15 +388,12 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1<

                    static_for<0, MRepeat, 1>{}([&](auto m0) {
                        static_for<0, KRepeat, 1>{}([&](auto k0) {
-                            static_for<0, KGroup, 1>{}([&](auto kg0) {
-                                a_thread_copy_.Run(
-                                    a_block_desc_m0_m1_m2_k0_k1_k2,
-                                    make_tuple(m0, I0, I0, Number<k0 * 2 + kg0>{}, I0, I0),
-                                    a_block_buf,
-                                    a_thread_desc_,
-                                    make_tuple(m0, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
-                                    a_thread_buf);
-                            });
+                            a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                               make_tuple(m0, I0, I0, k0, I0, I0),
+                                               a_block_buf,
+                                               a_thread_desc_,
+                                               make_tuple(m0, I0, I0, k0, I0, I0),
+                                               a_thread_buf);
                        });
                    });
                    // B VGPR->VGPR dequant
@@ -483,14 +477,12 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1<

            static_for<0, MRepeat, 1>{}([&](auto m0) {
                static_for<0, KRepeat, 1>{}([&](auto k0) {
-                    static_for<0, KGroup, 1>{}([&](auto kg0) {
-                        a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
-                                           make_tuple(m0, I0, I0, Number<k0 * 2 + kg0>{}, I0, I0),
-                                           a_block_buf,
-                                           a_thread_desc_,
-                                           make_tuple(m0, I0, I0, k0, I0, Number<kg0 * A_K1>{}),
-                                           a_thread_buf);
-                    });
+                    a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                       make_tuple(m0, I0, I0, k0, I0, I0),
+                                       a_block_buf,
+                                       a_thread_desc_,
+                                       make_tuple(m0, I0, I0, k0, I0, I0),
+                                       a_thread_buf);
                });
            });
            // B VGPR->VGPR dequant
@@ -596,7 +588,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_gufusion_bdequant_v1<
                                                         ComputeDataType,
                                                         decltype(a_block_desc_m0_m1_m2_k0_k1_k2),
                                                         decltype(a_thread_desc_),
-                                                         Sequence<1, 1, 1, 1, 1, KPack / KGroup>,
+                                                         Sequence<1, 1, 1, 1, 1, KPack>,
                                                         Sequence<0, 1, 2, 3, 4, 5>,
                                                         5,
                                                         A_K1,
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_wmma_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_wmma_cshuffle_v3.hpp
@@ -0,0 +1,759 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#include "ck/host_utility/flush_cache.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename GridwiseGemm,
+          typename ComputePtrOffsetOfStridedBatch,
+          bool HasMainKBlockLoop,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+          index_t MinimumOccupancy = 1,
+          TailNumber TailNum       = TailNumber::Full>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
+#endif
+        kernel_batched_gemm_wmma_cshuffle_v3(
+            typename GridwiseGemm::Argument
+                karg, // This works for now but it actually receives a
+                      // DeviceBatchedGemm_Wmma_CShuffleV3::Argument
+                      // argument through implicit conversion to base class!
+            const ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch)
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx11__) || defined(__gfx12__))
+#if defined(__gfx11__)
+    // gfx11 does not support *_atomic_pk_add_f16/bf16 instructions
+    using c_data_type = remove_cvref_t<remove_pointer_t<decltype(karg.p_c_grid)>>;
+    if constexpr(!(CGlobalMemoryDataOperation == InMemoryDataOperationEnum::AtomicAdd &&
+                   (std::is_same_v<c_data_type, ck::half_t> ||
+                    std::is_same_v<c_data_type, ck::bhalf_t>)))
+    {
+#endif
+        // The normal approach to batching would be to increase the grid size by just stretching out
+        // the grid Z dimension (which is the outermost dimension), but this depends on lower level
+        // functions not directly using the Z dimension for other calculations. As it turns out, k
+        // batching does rely directly on blockIdx.Z through SplitKBatchOffset. Therefore, for now
+        // we will use the grid Y dimension for batching. This may be a bit fragile.
+        const index_t g_idx = amd_wave_read_first_lane(blockIdx.y);
+
+        const long_index_t a_batch_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
+        const long_index_t b_batch_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
+        const long_index_t c_batch_offset =
+            amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx));
+
+        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+
+        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg);
+
+        GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+            karg.p_a_grid + splitk_batch_offset.a_k_split_offset + a_batch_offset,
+            karg.p_b_grid + splitk_batch_offset.b_k_split_offset + b_batch_offset,
+            karg.p_c_grid + splitk_batch_offset.c_reduce_offset + c_batch_offset,
+            p_shared,
+            karg);
+#if defined(__gfx11__)
+    }
+#endif
+#else
+    ignore = karg;
+    ignore = batch;
+    ignore = compute_ptr_offset_of_batch;
+#endif
+}
+
+/// @brief \"Universal\" Batched GEMM operation without SplitK support.
+///
+/// @par Overview
+///         This GEMM operation implements the following mathematical equation:
+///         C{G,M,N} = C_op(A_op(A{G,M,K}) * B_op(B{G,K,N}))
+///         Where A, B are input tensors and C is the output tensor. The A/B/C_op are
+///         elementwise operations applied to the A, B, and C tensors, respectively.
+///         The \"universal\" gemm comes with multiple pipelines optimized for different usage
+///         scenarios. That's why it's called \"universal\". It's universal through its design
+///         and versatilty.
+///
+/// @note   This Kernel implementation currently does not support the SplitK algorithm.
+///
+/// @tparam ALayout     A tensor data layout.
+/// @tparam BLayout     B tensor data layout.
+/// @tparam CLayout     C tensor data layout.
+/// @tparam ADataType   A tensor data type.
+/// @tparam BDataType   B tensor data type.
+/// @tparam CDataType   C tensor data type.
+/// @tparam AccDataType The accumulation data type related to the hardware
+///                         matrix-multiplication instruction.
+/// @tparam CShuffleDataType The data type used to store matrix-multiplication results into
+///                          LDS memory during \"CShuffle\" data layout optimization.
+/// @tparam AElementwiseOperation Elementwise operation applied to the A input tensor elements.
+/// @tparam BElementwiseOperation Elementwise operation applied to the B input tensor elements.
+/// @tparam CElementwiseOperation Elementwise operation applied to the C output tensor
+///                               (after GEMM).
+/// @tparam GemmSpec    Determines used "padding" version.
+/// @tparam BlockSize   The number of threads within workgroup.
+/// @tparam MPerBlock   The input/output data tile size in the M dimension.
+/// @tparam NPerBlock   The input/output data tile size in the N dimension.
+/// @tparam KPerBlock   The input data tile size in the K dimension.
+/// @tparam AK1         The vector load size from global memory for A tensor.
+/// @tparam BK1         The vector load size from global memory for B tensor.
+/// @tparam MPerWmma    M size of Wave Matrix Multiply Accumulate (WMMA) instruction.
+/// @tparam NPerWmma    N size of Wave Matrix Multiply Accumulate (WMMA) instruction.
+/// @tparam MRepeat     The number of iterations in the M dimension over output tile per wavefront.
+/// @tparam NRepeat     The number of iterations in the N dimension over output tile per wavefront.
+/// @tparam ABlockTransferThreadClusterLengths_AK0_M_AK1 Spatial thread distribution over the input
+///                                                      data. Can be interpreted as the answer
+///                                                      to the question, "How many threads can be
+///                                                      arranged on each input data axis?"
+/// @tparam ABlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over
+///                                                 the input tensor dimension. Can be interpreted
+///                                                 as the answer to the question: "In which
+///                                                 order to spread threads through tensor axes?".
+/// @tparam ABlockTransferSrcAccessOrder The order of accessing input tensor axes. Can be
+///                                      interpreted as the answer to the question "Which dimension
+///                                      to read first? And which next?" etc.
+/// @tparam ABlockTransferSrcVectorDim   The index of axis on which we could do vectorized memory
+///                                      access - the one with contiguous memory.
+/// @tparam ABlockTransferSrcScalarPerVector The size of vector access instruction - the number of
+///                                          elements accessed per thread per instruction.
+/// @tparam ABlockTransferDstScalarPerVector_AK1 The size of vectorized store into LDS memory.
+/// @tparam ABlockLdsExtraM                      Whether to use padding for LDS or not. With
+///                                              universal GEMM there's no need for padding.
+/// @tparam BBlockTransferThreadClusterLengths_BK0_N_BK1 Spatial thread distribution over the input
+///                                                      data. Can be interpreted as the answer
+///                                                      to the question: "How many threads to
+///                                                      arrange on each input data axis?"
+/// @tparam BBlockTransferThreadClusterArrangeOrder The order of thread spatial distribution over
+///                                                 the input tensor dimension. Can be interpreted
+///                                                 as the answer to the question: "In which
+///                                                 order to spread threads through tensor axes?".
+/// @tparam BBlockTransferSrcAccessOrder he order of accessing input tensor axes. Can be
+///                                      interpreted as the answer to the question "Which dimension
+///                                      to read first? And which next?" etc.
+/// @tparam BBlockTransferSrcVectorDim  The index of axis on which we could do vectorized memory
+///                                      access - the one with contiguous memory.
+/// @tparam BBlockTransferSrcScalarPerVector The size of vector access instruction - the number of
+///                                          elements accessed per thread per instruction.
+/// @tparam BBlockTransferDstScalarPerVector_BK1 The size of vectorized store into LDS memory.
+/// @tparam BBlockLdsExtraN                      Whether to use padding for LDS or not. With
+///                                              universal GEMM there's no need for padding.
+/// @tparam CShuffleMRepeatPerShuffle   The number of matrix-multiplication instructions
+///                                         results to process per wave per iteration of CShuffle
+///                                         in M dimension.
+/// @tparam CShuffleNRepeatPerShuffle   The number of matrix-multiplication instructions
+///                                         results to process per wave per iteration of CShuffle
+///                                         in N dimension.
+/// @tparam CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock The spatial
+///                                         thread distribution used for storing data into output
+///                                         tensor across output data layout dimensions.
+/// @tparam CShuffleBlockTransferScalarPerVector_NPerBlock The size of vectorized memory access.
+///                                         Used when storing data to output tensor.
+/// @tparam BlkGemmPipeSched    The version of blockwise-gemm pipeline scheduler (interwave or
+///                             intrawave).
+/// @tparam BlkGemmPipelineVer  The version of blockwise-gemm pipeline.
+/// @tparam ComputeTypeA    Data type used for A input of hardware matrix-multiplication
+///                         instructions.
+/// @tparam ComputeTypeB    Data type used for B input of hardware matrix-multiplication
+///                         instructions.
+/// @tparam PermuteA            Whether the A input tensor has gridwise-gemm friendly data layout
+///                             in global memory. Currently not supported!
+/// @tparam PermuteB            Whether the B input tensor has gridwise-gemm friendly data layout
+///                             in global memory (pre-shuffled). Currently not supported!
+template <typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename CDataType,
+          typename AccDataType,
+          typename CShuffleDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          GemmSpecialization GemmSpec,
+          index_t BlockSize,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t AK1,
+          index_t BK1,
+          index_t MPerWmma,
+          index_t NPerWmma,
+          index_t MRepeat,
+          index_t NRepeat,
+          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_AK1,
+          bool ABlockLdsExtraM,
+          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_BK1,
+          bool BBlockLdsExtraN,
+          index_t CShuffleMRepeatPerShuffle,
+          index_t CShuffleNRepeatPerShuffle,
+          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
+          BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
+          BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1,
+          typename ComputeTypeA                       = CDataType,
+          typename ComputeTypeB                       = ComputeTypeA,
+          bool PermuteA                               = false,
+          bool PermuteB                               = false>
+struct DeviceBatchedGemm_Wmma_CShuffleV3 : public DeviceBatchedGemm<ALayout,
+                                                                    BLayout,
+                                                                    CLayout,
+                                                                    ADataType,
+                                                                    BDataType,
+                                                                    CDataType,
+                                                                    AElementwiseOperation,
+                                                                    BElementwiseOperation,
+                                                                    CElementwiseOperation>
+{
+    // We are inheriting from DeviceBatchedGemm and this base class does not support permuteA and
+    // permuteB arguments so for now we are not including this functionality.
+    static_assert(PermuteA == false,
+                  "Permute A functionality not supported by DeviceBatchedGemm operations.\n");
+    static_assert(PermuteB == false,
+                  "Permute B functionality not supported by DeviceBatchedGemm operations.\n");
+
+    struct ComputePtrOffsetOfStridedBatch
+    {
+        ComputePtrOffsetOfStridedBatch(index_t BatchStrideA,
+                                       index_t BatchStrideB,
+                                       index_t BatchStrideC)
+            : BatchStrideA_(BatchStrideA), BatchStrideB_(BatchStrideB), BatchStrideC_(BatchStrideC)
+        {
+        }
+
+        __host__ __device__ constexpr long_index_t GetAPtrOffset(index_t g_idx) const
+        {
+            return g_idx * static_cast<long_index_t>(BatchStrideA_);
+        }
+
+        __host__ __device__ constexpr long_index_t GetBPtrOffset(index_t g_idx) const
+        {
+            return g_idx * static_cast<long_index_t>(BatchStrideB_);
+        }
+
+        __host__ __device__ constexpr long_index_t GetCPtrOffset(index_t g_idx) const
+        {
+            return g_idx * static_cast<long_index_t>(BatchStrideC_);
+        }
+
+        private:
+        index_t BatchStrideA_;
+        index_t BatchStrideB_;
+        index_t BatchStrideC_;
+    };
+
+    // GridwiseGemm
+    using GridwiseGemm = GridwiseGemm_wmma_cshuffle_v3<
+        ALayout,
+        BLayout,
+        CLayout,
+        ADataType,
+        BDataType,
+        AccDataType,
+        CShuffleDataType,
+        CDataType,
+        AElementwiseOperation,
+        BElementwiseOperation,
+        CElementwiseOperation,
+        GemmSpec,
+        BlockSize,
+        MPerBlock,
+        NPerBlock,
+        KPerBlock,
+        AK1,
+        BK1,
+        MPerWmma,
+        NPerWmma,
+        MRepeat,
+        NRepeat,
+        ABlockTransferThreadClusterLengths_AK0_M_AK1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorDim,
+        ABlockTransferSrcScalarPerVector,
+        ABlockTransferDstScalarPerVector_AK1,
+        false,
+        ABlockLdsExtraM,
+        BBlockTransferThreadClusterLengths_BK0_N_BK1,
+        BBlockTransferThreadClusterArrangeOrder,
+        BBlockTransferSrcAccessOrder,
+        BBlockTransferSrcVectorDim,
+        BBlockTransferSrcScalarPerVector,
+        BBlockTransferDstScalarPerVector_BK1,
+        false,
+        BBlockLdsExtraN,
+        CShuffleMRepeatPerShuffle,
+        CShuffleNRepeatPerShuffle,
+        CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+        CShuffleBlockTransferScalarPerVector_NPerBlock,
+        BlkGemmPipeSched,
+        BlkGemmPipelineVer,
+        ComputeTypeA,
+        ComputeTypeB,
+        false,  // PermuteA not supported by DeviceBatchedGemm base class.
+        false>; // PermuteB not supported by DeviceBatchedGemm base class.
+
+    // Argument
+    struct Argument : public GridwiseGemm::Argument
+    {
+        __host__ Argument(const ADataType* p_a_grid_,
+                          const BDataType* p_b_grid_,
+                          CDataType* p_c_grid_,
+                          index_t M_,
+                          index_t N_,
+                          index_t K_,
+                          index_t StrideA_,
+                          index_t StrideB_,
+                          index_t StrideC_,
+                          index_t BatchStrideA_,
+                          index_t BatchStrideB_,
+                          index_t BatchStrideC_,
+                          index_t Batch_,
+                          index_t k_batch_,
+                          bool is_reduce_ = false)
+            : GridwiseGemm::Argument(p_a_grid_,
+                                     p_b_grid_,
+                                     p_c_grid_,
+                                     M_,
+                                     N_,
+                                     K_,
+                                     StrideA_,
+                                     StrideB_,
+                                     StrideC_,
+                                     k_batch_,
+                                     is_reduce_),
+              Batch(Batch_),
+              compute_ptr_offset_of_batch{BatchStrideA_, BatchStrideB_, BatchStrideC_}
+        {
+        }
+
+        index_t Batch;
+        ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch;
+    };
+
+    /// @brief  Helper structure responsible for kernel invocation.
+    ///
+    /// @paragraph  The `Invoker` class is responsible for preparation and invocation of actual GPU
+    ///             kernel function. It usually determines the launched grid size prepares kernel
+    ///             arguments as well as perform specific kernel configuration selection based on
+    ///             runtime arguments.
+    ///
+    /// @note       If appropriately configured it may measure kernel execution time.
+    ///
+    struct Invoker : public BaseInvoker
+    {
+        /// @brief  This function issues GPU kernel execution.
+        /// @param arg           The GPU kernel arguments.
+        /// @param stream_config The HIP stream configuration helper structure.
+        /// @return              The kernel's average execution time (if time measurement is
+        ///                      enabled).
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            if(stream_config.log_level_ > 0)
+            {
+                arg.Print();
+                GridwiseGemm::BlockwiseGemmPipe::HotLoopInstList::Print();
+            }
+
+            if(!GridwiseGemm::CheckValidity(arg))
+            {
+                throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
+            }
+
+            index_t gdx, gdy, gdz;
+            std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(arg.M, arg.N, arg.KBatch);
+
+            // The normal approach to batching would be to increase the grid size by just stretching
+            // out the grid Z dimension (which is the outermost dimension), but this depends on
+            // lower level functions not directly using the Z dimension for other calculations. As
+            // it turns out, k batching does rely directly on blockIdx.Z through SplitKBatchOffset.
+            // Therefore, for now we will use the grid Y dimension for batching. This may be a bit
+            // fragile.
+            gdy *= arg.Batch;
+
+            float ave_time = 0;
+
+            index_t k_grain = arg.KBatch * KPerBlock;
+            index_t K_split = (arg.K + k_grain - 1) / k_grain * KPerBlock;
+
+            const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split);
+
+            const auto Run = [&](const auto& kernel) {
+                if(stream_config.flush_cache)
+                {
+                    Argument arg_ = arg;
+
+                    const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
+                        arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
+                    const auto b_grid_desc_bk0_n_bk1 = GridwiseGemm::MakeBGridDescriptor_BK0_N_BK1(
+                        arg_.K, arg_.KPadded, arg_.N, arg_.NPadded, arg_.StrideB, arg_.BK0);
+
+                    // Packed sizes are 1 for all implemented data types but we include it anyway
+                    // for future compatibility.
+                    auto size_a_buffer = a_grid_desc_ak0_m_ak1.GetElementSpaceSize() *
+                                         sizeof(ADataType) / GridwiseGemm::APackedSize;
+                    auto size_b_buffer = b_grid_desc_bk0_n_bk1.GetElementSpaceSize() *
+                                         sizeof(BDataType) / GridwiseGemm::BPackedSize;
+
+                    // Note: the grid descriptors and size_a / size_b do *not* take batching into
+                    // account, so we have to manually multiply overall buffer sizes for rotating
+                    // memory by batch.
+                    ck::utility::RotatingMemWrapper<Argument> rotating_mem(
+                        arg_,
+                        stream_config.rotating_count,
+                        arg_.Batch * size_a_buffer,
+                        arg_.Batch * size_b_buffer);
+                    rotating_mem.Print();
+
+                    auto run_flush_cache = [&]() {
+                        // flush icache
+                        ck::utility::flush_icache();
+                        // rotating mem
+                        rotating_mem.Next();
+                        // clear c mem
+                        if(arg_.KBatch > 1)
+                            // Note: we multiply by batch since we want to clear the C matrix for
+                            // the whole batch. Untested since we don't have k batching ATM.
+                            // Note: This seems incorrect for non-contiguous memory layouts for C
+                            // (padding, gaps).
+                            HIP_CHECK_ERROR(
+                                hipMemsetAsync(arg_.p_c_grid,
+                                               0,
+                                               arg_.Batch * arg_.M * arg_.N * sizeof(CDataType),
+                                               stream_config.stream_id_));
+                    };
+
+                    ave_time = ck::utility::launch_and_time_kernel_with_preprocess<false>(
+                        stream_config,
+                        run_flush_cache,
+                        kernel,
+                        dim3(gdx, gdy, gdz),
+                        dim3(BlockSize),
+                        0,
+                        arg_,
+                        arg_.compute_ptr_offset_of_batch);
+                }
+                else
+                {
+                    auto clear_workspace = [&]() {
+                        // clear c mem
+                        if(arg.KBatch > 1)
+                            // Note: we multiply by batch since we want to clear the C matrix for
+                            // the whole batch. Untested since we don't have k batching ATM.
+                            // Note: This seems incorrect for non-contiguous memory layouts for C
+                            // (padding, gaps).
+                            HIP_CHECK_ERROR(
+                                hipMemsetAsync(arg.p_c_grid,
+                                               0,
+                                               arg.Batch * arg.M * arg.N * sizeof(CDataType),
+                                               stream_config.stream_id_));
+                    };
+
+                    ave_time = ck::utility::launch_and_time_kernel_with_preprocess<false>(
+                        stream_config,
+                        clear_workspace,
+                        kernel,
+                        dim3(gdx, gdy, gdz),
+                        dim3(BlockSize),
+                        0,
+                        arg,
+                        arg.compute_ptr_offset_of_batch);
+                }
+            };
+
+            constexpr index_t minimum_occupancy = []() {
+                if constexpr(BlkGemmPipeSched == BlockGemmPipelineScheduler::Interwave)
+                {
+                    return 2;
+                }
+                else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
+                {
+                    return (MPerBlock * NPerBlock / BlockSize <= 128) ? 2 : 1;
+                }
+                else
+                {
+                    return 1;
+                }
+            }();
+
+            if(has_main_k_block_loop)
+            {
+                // Tail number always full
+                if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1 ||
+                             BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        const auto kernel = kernel_batched_gemm_wmma_cshuffle_v3<
+                            GridwiseGemm,
+                            ComputePtrOffsetOfStridedBatch,
+                            true,
+                            InMemoryDataOperationEnum::AtomicAdd,
+                            minimum_occupancy>;
+                        Run(kernel);
+                    }
+                    else
+                    {
+                        const auto kernel = kernel_batched_gemm_wmma_cshuffle_v3<
+                            GridwiseGemm,
+                            remove_reference_t<ComputePtrOffsetOfStridedBatch>,
+                            true,
+                            InMemoryDataOperationEnum::Set,
+                            minimum_occupancy>;
+                        Run(kernel);
+                    }
+                }
+                else
+                {
+                    // TODO: Implement
+                }
+            }
+            else
+            {
+                // Tail number always 1
+                if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        const auto kernel = kernel_batched_gemm_wmma_cshuffle_v3<
+                            GridwiseGemm,
+                            ComputePtrOffsetOfStridedBatch,
+                            false,
+                            InMemoryDataOperationEnum::AtomicAdd,
+                            minimum_occupancy>;
+                        Run(kernel);
+                    }
+                    else
+                    {
+                        const auto kernel = kernel_batched_gemm_wmma_cshuffle_v3<
+                            GridwiseGemm,
+                            remove_reference_t<ComputePtrOffsetOfStridedBatch>,
+                            false,
+                            InMemoryDataOperationEnum::Set,
+                            minimum_occupancy>;
+                        Run(kernel);
+                    }
+                }
+            }
+
+            return ave_time;
+        }
+
+        // polymorphic
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+
+    static bool IsSupportedArgument(const Argument& arg)
+    {
+        if(!ck::is_gfx11_supported() && !ck::is_gfx12_supported())
+        {
+            return false;
+        }
+
+        if constexpr(std::is_same_v<CDataType, ck::half_t> ||
+                     std::is_same_v<CDataType, ck::bhalf_t>)
+        {
+            if(arg.KBatch > 1 && ck::is_gfx11_supported())
+            {
+                // gfx11 does not support *_atomic_pk_add_f16/bf16 instructions
+                return false;
+            }
+        }
+
+        if constexpr(std::is_same_v<ComputeTypeA, f8_t> || std::is_same_v<ComputeTypeA, bf8_t> ||
+                     std::is_same_v<ComputeTypeB, f8_t> || std::is_same_v<ComputeTypeB, bf8_t>)
+        {
+            if(ck::is_gfx11_supported())
+            {
+                return false;
+            }
+        }
+
+        if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MKPadding ||
+                                                       GemmSpec == GemmSpecialization::NKPadding ||
+                                                       GemmSpec == GemmSpecialization::MNKPadding ||
+                                                       GemmSpec == GemmSpecialization::KPadding))
+        {
+            return false;
+        }
+
+        return GridwiseGemm::CheckValidity(arg);
+    }
+
+    // polymorphic
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
+    }
+
+    // TODO: This is not part of the DeviceBatchedGemm base class but it was part of
+    // DeviceBatchedGemmV2. Remove?
+    // index_t GetKPerBlock() override { return KPerBlock; }
+    // bool GetPermuteA() override { return PermuteA; }
+    // bool GetPermuteB() override { return PermuteB; }
+
+    static auto MakeArgument(const ADataType* p_a,
+                             const BDataType* p_b,
+                             CDataType* p_c,
+                             index_t M,
+                             index_t N,
+                             index_t K,
+                             index_t StrideA,
+                             index_t StrideB,
+                             index_t StrideC,
+                             index_t BatchStrideA,
+                             index_t BatchStrideB,
+                             index_t BatchStrideC,
+                             index_t Batch,
+                             AElementwiseOperation,
+                             BElementwiseOperation,
+                             CElementwiseOperation)
+    {
+        return Argument{p_a,
+                        p_b,
+                        p_c,
+                        M,
+                        N,
+                        K,
+                        StrideA,
+                        StrideB,
+                        StrideC,
+                        BatchStrideA,
+                        BatchStrideB,
+                        BatchStrideC,
+                        Batch,
+                        1 /* KBatch */};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    // polymorphic
+    std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
+                                                      const void* p_b,
+                                                      void* p_c,
+                                                      index_t M,
+                                                      index_t N,
+                                                      index_t K,
+                                                      index_t StrideA,
+                                                      index_t StrideB,
+                                                      index_t StrideC,
+                                                      index_t BatchStrideA,
+                                                      index_t BatchStrideB,
+                                                      index_t BatchStrideC,
+                                                      index_t Batch,
+                                                      AElementwiseOperation,
+                                                      BElementwiseOperation,
+                                                      CElementwiseOperation) override
+    {
+        return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
+                                          static_cast<const BDataType*>(p_b),
+                                          static_cast<CDataType*>(p_c),
+                                          M,
+                                          N,
+                                          K,
+                                          StrideA,
+                                          StrideB,
+                                          StrideC,
+                                          BatchStrideA,
+                                          BatchStrideB,
+                                          BatchStrideC,
+                                          Batch,
+                                          1); // KBatch
+    }
+
+    // polymorphic
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    // polymorphic
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        std::map<BlockGemmPipelineScheduler, std::string> BlkGemmPipelineSchedulerToString{
+            {BlockGemmPipelineScheduler::Intrawave, "Intrawave"},
+            {BlockGemmPipelineScheduler::Interwave, "Interwave"}};
+
+        std::map<BlockGemmPipelineVersion, std::string> BlkGemmPipelineVersionToString{
+            {BlockGemmPipelineVersion::v1, "v1"},
+            {BlockGemmPipelineVersion::v2, "v2"},
+            {BlockGemmPipelineVersion::v3, "v3"},
+            {BlockGemmPipelineVersion::v4, "v4"},
+            {BlockGemmPipelineVersion::v5, "v5"}};
+
+        // clang-format off
+        str << "DeviceBatchedGemm_Wmma_CShuffleV3"
+            << "<"
+            << getGemmSpecializationString(GemmSpec) << ", "
+            << std::string(ALayout::name)[0]
+            << std::string(BLayout::name)[0]
+            << std::string(CLayout::name)[0]
+            << ">"
+            << " BlkSize: "
+            << BlockSize << ", "
+            << "BlkTile: "
+            << MPerBlock << "x" << NPerBlock << "x" << KPerBlock << ", "
+            << "WaveTile: "
+            << MPerWmma << "x"<<NPerWmma << ", "
+            << "WaveMap: "
+            << MRepeat << "x" << NRepeat << ", "
+            << "VmemReadVec: "
+            << ABlockTransferSrcScalarPerVector << "x" << BBlockTransferSrcScalarPerVector << ", "
+            << "BlkGemmPipelineScheduler: "
+            << BlkGemmPipelineSchedulerToString[BlkGemmPipeSched] << ", "
+            << "BlkGemmPipelineVersion: "
+            << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
+            << "BlkGemmPipelinePrefetchStages: "
+            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages << ", "
+            << "KPack: "
+            << GridwiseGemm::KPack;
+        // clang-format on
+
+        return str.str();
+    }
+    REGISTER_EXTRA_PRINTING_METHODS
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
@@ -77,7 +77,8 @@ template <typename GridwiseGemm,
          typename ComputePtrOffsetOfN,
          bool HasMainKBlockLoop,
          bool isMultiA,
-          bool isMultiB>
+          bool isMultiB,
+          bool CTranspose>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
@@ -171,17 +172,22 @@ __global__ void
    }
    else
    {
-        const long_index_t a_group_offset =
-            amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetAPtrOffset(g_idx));
        const long_index_t b_group_offset =
-            amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetBPtrOffset(g_idx));
-
+            CTranspose
+                ? amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetAPtrOffset(g_idx))
+                : amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetBPtrOffset(g_idx));
+        const long_index_t a_group_offset =
+            CTranspose
+                ? amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetBPtrOffset(g_idx))
+                : amd_wave_read_first_lane(compute_ptr_offset_of_groups.GetAPtrOffset(g_idx));
+        const long_index_t b_n_offset =
+            CTranspose ? amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx)) : 0;
        const long_index_t a_n_offset =
-            amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx));
+            CTranspose ? 0 : amd_wave_read_first_lane(compute_ptr_offset_of_n.GetAPtrOffset(n_idx));

        GridwiseGemm::template Run<HasMainKBlockLoop, InMemoryDataOperationEnum::Set>(
            p_as_grid + a_group_offset + a_n_offset,
-            p_bs_grid + b_group_offset,
+            p_bs_grid + b_group_offset + b_n_offset,
            p_ds_grid_grp,
            p_e_grid + e_group_offset + e_n_offset,
            p_shared,
@@ -335,12 +341,28 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
    static constexpr auto I4 = Number<4>{};
    static constexpr auto I5 = Number<5>{};

+    static constexpr bool isATensorColMajor =
+        (ConvForwardSpecialization == ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) &&
+        (ABlockTransferSrcVectorDim == 1) && (NumGroupsToMerge == 1) &&
+        (is_NGCHW_NGKHW<ALayout, BLayout, ELayout>() ||
+         is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>());
+
+    static constexpr bool NeedTransposeKernel =
+        (isATensorColMajor == false) && (is_NGCHW_NGKHW<ALayout, BLayout, ELayout>() ||
+                                         is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>());
+
+    static constexpr bool CTranspose = (NeedTransposeKernel == false) && (isMultiAB == false) &&
+                                       (is_same_v<ELayout, tensor_layout::convolution::NGKHW> ||
+                                        is_same_v<ELayout, tensor_layout::convolution::NGKDHW>);
+
    using ConvToGemmFwdTransformer = TransformConvFwdToGemm<NDimSpatial,
                                                            ConvForwardSpecialization,
                                                            true /*SplitN*/,
                                                            ADataType,
                                                            EDataType,
-                                                            NumGroupsToMerge>;
+                                                            NumGroupsToMerge,
+                                                            index_t,
+                                                            CTranspose>;

    static constexpr index_t ClusterLengthNPerBlock =
        CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock::At(3);
@@ -361,9 +383,11 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
    {
        namespace ctc = tensor_layout::convolution;
        using Layout  = std::conditional_t<
-            is_NGCHW_NGKHW<ALayout, BLayout, ELayout>(),
+            is_NGCHW_NGKHW<ALayout, BLayout, ELayout>() && NeedTransposeKernel,
            ctc::NHWGC,
-            std::conditional_t<is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>(), ctc::NDHWGC, ALay>>;
+            std::conditional_t<is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>() && NeedTransposeKernel,
+                               ctc::NDHWGC,
+                               ALay>>;

        const auto in_gemmmraw_gemmkraw_desc =
            conv_to_gemm_transformer.template MakeADescriptor_M_K<Layout>();
@@ -379,9 +403,11 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
    {
        namespace ctc = tensor_layout::convolution;
        using Layout  = std::conditional_t<
-            is_NGCHW_NGKHW<ALayout, BLayout, ELayout>(),
+            is_NGCHW_NGKHW<ALayout, BLayout, ELayout>() && NeedTransposeKernel,
            ctc::GKYXC,
-            std::conditional_t<is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>(), ctc::GKZYXC, BLay>>;
+            std::conditional_t<is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>() && NeedTransposeKernel,
+                               ctc::GKZYXC,
+                               BLay>>;

        const auto wei_gemmnraw_gemmkraw_desc =
            conv_to_gemm_transformer.template MakeBDescriptor_N_K<Layout>();
@@ -397,17 +423,24 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
    {
        namespace ctc = tensor_layout::convolution;
        using Layout  = std::conditional_t<
-            is_NGCHW_NGKHW<ALayout, BLayout, ELayout>(),
+            is_NGCHW_NGKHW<ALayout, BLayout, ELayout>() && NeedTransposeKernel,
            ctc::NHWGK,
-            std::conditional_t<is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>(), ctc::NDHWGK, ELay>>;
+            std::conditional_t<is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>() && NeedTransposeKernel,
+                               ctc::NDHWGK,
+                               ELay>>;

        const auto out_gemmmraw_gemmnraw_desc =
            conv_to_gemm_transformer.template MakeCDescriptor_M_N<Layout>();
-
-        const auto out_gemmm_gemmn_desc =
-            matrix_padder.PadCDescriptor_M_N(out_gemmmraw_gemmnraw_desc);
-
-        return out_gemmm_gemmn_desc;
+        if constexpr(CTranspose)
+        {
+            constexpr auto matrix_padder_trans =
+                MatrixPadder<GemmSpec, index_t, index_t, index_t>{NPerBlock, MPerBlock, KPerBlock};
+            return matrix_padder_trans.PadCDescriptor_M_N(out_gemmmraw_gemmnraw_desc);
+        }
+        else
+        {
+            return matrix_padder.PadCDescriptor_M_N(out_gemmmraw_gemmnraw_desc);
+        }
    }

    // Shape of Ds and E must be aligned. Strides can be different.
@@ -471,11 +504,32 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
        CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,                      \
        CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVersion::v1,             \
        BComputeDataType, DoElementwiseBeforeCShuffle
+
+#define GridwiseGemmCTransposeTemplateParameters                                               \
+    GemmBDataType, GemmADataType, AComputeDataType, AccDataType, CShuffleDataType, DsDataType, \
+        EDataType, BElementwiseOperation, AElementwiseOperation, CDEElementwiseOperation,      \
+        NumGemmKPrefetchStage, BlockSize, NPerBlock, MPerBlock, KPerBlock, BK1, AK1, NPerXDL,  \
+        MPerXDL, NXdlPerWave, MXdlPerWave, BBlockTransferThreadClusterLengths_BK0_N_BK1,       \
+        BBlockTransferThreadClusterArrangeOrder, BBlockTransferSrcAccessOrder,                 \
+        BBlockTransferSrcVectorDim, BBlockTransferSrcScalarPerVector,                          \
+        BBlockTransferDstScalarPerVector_BK1, false, BBlockLdsExtraN,                          \
+        ABlockTransferThreadClusterLengths_AK0_M_AK1, ABlockTransferThreadClusterArrangeOrder, \
+        ABlockTransferSrcAccessOrder, ABlockTransferSrcVectorDim,                              \
+        ABlockTransferSrcScalarPerVector, ABlockTransferDstScalarPerVector_AK1, false,         \
+        ABlockLdsExtraM, CShuffleMXdlPerWavePerShuffle, CShuffleNXdlPerWavePerShuffle,         \
+        CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,                      \
+        CDEBlockTransferScalarPerVector_NPerBlock, LoopSched, PipelineVersion::v1,             \
+        BComputeDataType, DoElementwiseBeforeCShuffle
+
    // Use appropriate gridwise gemm
    using GridwiseGemm = std::conditional_t<
        isMultiA || isMultiB,
        GridwiseGemmMultipleABD_xdl_cshuffle<GridwiseGemmMultiABDTemplateParameters>,
        GridwiseGemmMultipleD_xdl_cshuffle<GridwiseGemmTemplateParameters>>;
+    using GridwiseGemmCTranspose = std::conditional_t<
+        CTranspose,
+        GridwiseGemmMultipleD_xdl_cshuffle<GridwiseGemmCTransposeTemplateParameters>,
+        GridwiseGemm>;

    // If ADataTypes or BDataTypes is tuple, user has to pass std::array with pointers.
    using APointers =
@@ -497,15 +551,16 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(
            BGridDesc_N_K{}))>;
    using DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
-        decltype(GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+        decltype(GridwiseGemmCTranspose::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
            DsGridDesc_M_N{}))>;
-    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
-        remove_cvref_t<decltype(GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+    using EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock = remove_cvref_t<
+        decltype(GridwiseGemmCTranspose::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
            EGridDesc_M_N{}))>;

    // block-to-e-tile map
    using Block2ETileMap =
-        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
+        remove_cvref_t<decltype(GridwiseGemmCTranspose::MakeDefaultBlock2ETileMap(
+            EGridDesc_M_N{}))>;
    using Block2TileMapElementwise = BlockToCTileMap_M00_N0_M01Adapt<NPerBlock, NPerBlock>;

    using NGCHWTransposeDescType =
@@ -612,16 +667,22 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
              p_ds_grid_{},
              p_e_grid_{static_cast<EDataType*>(p_e)},
              a_g_n_c_wis_lengths_{a_g_n_c_wis_lengths},
-              a_g_n_c_wis_strides_{conv_ngchw_to_nhwgc_transformer.TransposeInOutStrides(
-                  a_g_n_c_wis_lengths, a_g_n_c_wis_strides)},
+              a_g_n_c_wis_strides_{NeedTransposeKernel
+                                       ? conv_ngchw_to_nhwgc_transformer.TransposeInOutStrides(
+                                             a_g_n_c_wis_lengths, a_g_n_c_wis_strides)
+                                       : a_g_n_c_wis_strides},
              b_g_k_c_xs_lengths_{b_g_k_c_xs_lengths},
-              b_g_k_c_xs_strides_{conv_ngchw_to_nhwgc_transformer.TransposeWeiStrides(
-                  b_g_k_c_xs_lengths, b_g_k_c_xs_strides)},
+              b_g_k_c_xs_strides_{NeedTransposeKernel
+                                      ? conv_ngchw_to_nhwgc_transformer.TransposeWeiStrides(
+                                            b_g_k_c_xs_lengths, b_g_k_c_xs_strides)
+                                      : b_g_k_c_xs_strides},
              ds_g_n_k_wos_lengths_{ds_g_n_k_wos_lengths},
              ds_g_n_k_wos_strides_{ds_g_n_k_wos_strides},
              e_g_n_k_wos_lengths_{e_g_n_k_wos_lengths},
-              e_g_n_k_wos_strides_{conv_ngchw_to_nhwgc_transformer.TransposeInOutStrides(
-                  e_g_n_k_wos_lengths, e_g_n_k_wos_strides)},
+              e_g_n_k_wos_strides_{NeedTransposeKernel
+                                       ? conv_ngchw_to_nhwgc_transformer.TransposeInOutStrides(
+                                             e_g_n_k_wos_lengths, e_g_n_k_wos_strides)
+                                       : e_g_n_k_wos_strides},
              conv_filter_strides_{conv_filter_strides},
              conv_filter_dilations_{conv_filter_dilations},
              input_left_pads_{input_left_pads},
@@ -651,7 +712,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                  GridwiseGemm::MakeDefaultBGridDescriptor_BK0_N_BK1(b_grid_desc_n_k_)},
              ds_grid_desc_mblock_mperblock_nblock_nperblock_{},
              e_grid_desc_mblock_mperblock_nblock_nperblock_{},
-              block_2_etile_map_{GridwiseGemm::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
+              block_2_etile_map_{
+                  GridwiseGemmCTranspose::MakeDefaultBlock2ETileMap(e_grid_desc_m_n_)},
              compute_ptr_offset_of_groups_{},
              compute_ptr_offset_of_n_{},
              a_element_op_{a_element_op},
@@ -783,24 +845,34 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
            }
            else
            {
-                if(GridwiseGemm::CheckValidity(a_grid_desc_m_k_,
-                                               b_grid_desc_n_k_,
-                                               ds_grid_desc_m_n_,
-                                               e_grid_desc_m_n_,
-                                               block_2_etile_map_))
+                bool valid = false;
+                if constexpr(CTranspose)
                {
-                    e_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                        GridwiseGemm::MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                            e_grid_desc_m_n_);
+                    valid = GridwiseGemmCTranspose::CheckValidity(b_grid_desc_n_k_,
+                                                                  a_grid_desc_m_k_,
+                                                                  ds_grid_desc_m_n_,
+                                                                  e_grid_desc_m_n_,
+                                                                  block_2_etile_map_);
+                }
+                else
+                {
+                    valid = GridwiseGemmCTranspose::CheckValidity(a_grid_desc_m_k_,
+                                                                  b_grid_desc_n_k_,
+                                                                  ds_grid_desc_m_n_,
+                                                                  e_grid_desc_m_n_,
+                                                                  block_2_etile_map_);
+                }
+                if(valid)
+                {
+                    e_grid_desc_mblock_mperblock_nblock_nperblock_ = GridwiseGemmCTranspose::
+                        MakeEGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(e_grid_desc_m_n_);

-                    ds_grid_desc_mblock_mperblock_nblock_nperblock_ =
-                        GridwiseGemm::MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                            ds_grid_desc_m_n_);
+                    ds_grid_desc_mblock_mperblock_nblock_nperblock_ = GridwiseGemmCTranspose::
+                        MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(ds_grid_desc_m_n_);
                }
            }

-            if constexpr(is_NGCHW_NGKHW<ALayout, BLayout, ELayout>() ||
-                         is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>())
+            if constexpr(NeedTransposeKernel)
            {
                // Use not modified base strides
                a_in_transpose_desc_ =
@@ -835,8 +907,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle

        std::size_t GetWorkspaceATensorSizeBytes() const
        {
-            if constexpr(is_NGCHW_NGKHW<ALayout, BLayout, ELayout>() ||
-                         is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>())
+            if constexpr(NeedTransposeKernel)
            {
                const long_index_t a_acum = ck::accumulate_n<long_index_t>(
                    a_g_n_c_wis_lengths_.begin(), NDimSpatial + I3, 1, std::multiplies<>());
@@ -851,8 +922,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle

        std::size_t GetWorkspaceBTensorSizeBytes() const
        {
-            if constexpr(is_NGCHW_GKCYX_NGKHW<ALayout, BLayout, ELayout>() ||
-                         is_NGCDHW_GKCZYX_NGKDHW<ALayout, BLayout, ELayout>())
+            if constexpr(NeedTransposeKernel)
            {
                const long_index_t b_acum = ck::accumulate_n<long_index_t>(
                    b_g_k_c_xs_lengths_.begin(), NDimSpatial + I3, 1, std::multiplies<>());
@@ -867,8 +937,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle

        std::size_t GetWorkspaceETensorSizeBytes() const
        {
-            if constexpr(is_NGCHW_NGKHW<ALayout, BLayout, ELayout>() ||
-                         is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>())
+            if constexpr(NeedTransposeKernel)
            {
                const long_index_t e_accum = ck::accumulate_n<long_index_t>(
                    e_g_n_k_wos_lengths_.begin(), NDimSpatial + I3, 1, std::multiplies<>());
@@ -1007,7 +1076,8 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                        ComputePtrOffsetOfStridedBatch<NumATensor, I1, NumDTensor>,
                        has_main_loop,
                        isMultiA,
-                        isMultiB>;
+                        isMultiB,
+                        CTranspose>;

                    return launch_and_time_kernel(
                        stream_config,
@@ -1035,68 +1105,118 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                    const ADataType* p_a_grid = arg.p_as_grid_.At(I0);
                    const BDataType* p_b_grid = arg.p_bs_grid_.At(I0);
                    EDataType* p_e_grid       = arg.p_e_grid_;
-
-                    if constexpr(is_NGCHW_GKCYX_NGKHW<ALayout, BLayout, ELayout>() ||
-                                 is_NGCDHW_GKCZYX_NGKDHW<ALayout, BLayout, ELayout>())
+                    if constexpr(NeedTransposeKernel)
                    {
-                        p_a_grid = type_convert<const ADataType*>(arg.p_workspace_);
-                        p_b_grid = type_convert<const BDataType*>(arg.p_workspace_) +
-                                   arg.GetWorkspaceATensorSizeBytes() / sizeof(BDataType);
-                        p_e_grid = type_convert<EDataType*>(arg.p_workspace_) +
-                                   (arg.GetWorkspaceATensorSizeBytes() +
-                                    arg.GetWorkspaceBTensorSizeBytes()) /
-                                       sizeof(EDataType);
-                    }
-                    else if constexpr(is_NGCHW_GKYXC_NGKHW<ALayout, BLayout, ELayout>() ||
-                                      is_NGCDHW_GKZYXC_NGKDHW<ALayout, BLayout, ELayout>())
-                    {
-                        p_a_grid = type_convert<const ADataType*>(arg.p_workspace_);
-                        p_e_grid = type_convert<EDataType*>(arg.p_workspace_) +
-                                   (arg.GetWorkspaceATensorSizeBytes() +
-                                    arg.GetWorkspaceBTensorSizeBytes()) /
-                                       sizeof(EDataType);
+                        if constexpr(is_NGCHW_GKCYX_NGKHW<ALayout, BLayout, ELayout>() ||
+                                     is_NGCDHW_GKCZYX_NGKDHW<ALayout, BLayout, ELayout>())
+                        {
+                            p_a_grid = type_convert<const ADataType*>(arg.p_workspace_);
+                            p_b_grid = type_convert<const BDataType*>(arg.p_workspace_) +
+                                       arg.GetWorkspaceATensorSizeBytes() / sizeof(BDataType);
+                            p_e_grid = type_convert<EDataType*>(arg.p_workspace_) +
+                                       (arg.GetWorkspaceATensorSizeBytes() +
+                                        arg.GetWorkspaceBTensorSizeBytes()) /
+                                           sizeof(EDataType);
+                        }
+                        else if constexpr(is_NGCHW_GKYXC_NGKHW<ALayout, BLayout, ELayout>() ||
+                                          is_NGCDHW_GKZYXC_NGKDHW<ALayout, BLayout, ELayout>())
+                        {
+                            p_a_grid = type_convert<const ADataType*>(arg.p_workspace_);
+                            p_e_grid = type_convert<EDataType*>(arg.p_workspace_) +
+                                       (arg.GetWorkspaceATensorSizeBytes() +
+                                        arg.GetWorkspaceBTensorSizeBytes()) /
+                                           sizeof(EDataType);
+                        }
                    }

-                    const auto kernel = kernel_grouped_conv_fwd_multiple_abd_xdl_cshuffle<
-                        GridwiseGemm,
-                        const ADataType*,
-                        const BDataType*,
-                        typename GridwiseGemm::DsGridPointer,
-                        EDataType,
-                        AElementwiseOperation,
-                        BElementwiseOperation,
-                        CDEElementwiseOperation,
-                        DeviceOp::AGridDesc_AK0_M_AK1,
-                        DeviceOp::BGridDesc_BK0_N_BK1,
-                        DeviceOp::DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
-                        DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
-                        Block2ETileMap,
-                        ComputePtrOffsetOfStridedBatch<NumATensor, NumBTensor, NumDTensor>,
-                        ComputePtrOffsetOfStridedBatch<NumATensor, I1, NumDTensor>,
-                        has_main_loop,
-                        isMultiA,
-                        isMultiB>;
+                    if constexpr(CTranspose)
+                    {
+                        const auto kernel = kernel_grouped_conv_fwd_multiple_abd_xdl_cshuffle<
+                            GridwiseGemmCTranspose,
+                            const BDataType*,
+                            const ADataType*,
+                            typename GridwiseGemm::DsGridPointer,
+                            EDataType,
+                            BElementwiseOperation,
+                            AElementwiseOperation,
+                            CDEElementwiseOperation,
+                            DeviceOp::BGridDesc_BK0_N_BK1,
+                            DeviceOp::AGridDesc_AK0_M_AK1,
+                            DeviceOp::DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                            DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                            Block2ETileMap,
+                            ComputePtrOffsetOfStridedBatch<NumATensor, NumBTensor, NumDTensor>,
+                            ComputePtrOffsetOfStridedBatch<NumATensor, I1, NumDTensor>,
+                            has_main_loop,
+                            isMultiA,
+                            isMultiB,
+                            CTranspose>;

-                    return launch_and_time_kernel(
-                        stream_config,
-                        kernel,
-                        dim3(gdx, gdy, gdz),
-                        dim3(BlockSize),
-                        0,
-                        p_a_grid,
-                        p_b_grid,
-                        arg.p_ds_grid_,
-                        p_e_grid,
-                        arg.a_element_op_,
-                        arg.b_element_op_,
-                        arg.cde_element_op_,
-                        arg.a_grid_desc_ak0_m_ak1_,
-                        arg.b_grid_desc_bk0_n_bk1_,
-                        arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_,
-                        arg.e_grid_desc_mblock_mperblock_nblock_nperblock_,
-                        arg.block_2_etile_map_,
-                        arg.compute_ptr_offset_of_groups_,
-                        arg.compute_ptr_offset_of_n_);
+                        return launch_and_time_kernel(
+                            stream_config,
+                            kernel,
+                            dim3(gdx, gdy, gdz),
+                            dim3(BlockSize),
+                            0,
+                            p_b_grid,
+                            p_a_grid,
+                            arg.p_ds_grid_,
+                            p_e_grid,
+                            arg.b_element_op_,
+                            arg.a_element_op_,
+                            arg.cde_element_op_,
+                            arg.b_grid_desc_bk0_n_bk1_,
+                            arg.a_grid_desc_ak0_m_ak1_,
+                            arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_,
+                            arg.e_grid_desc_mblock_mperblock_nblock_nperblock_,
+                            arg.block_2_etile_map_,
+                            arg.compute_ptr_offset_of_groups_,
+                            arg.compute_ptr_offset_of_n_);
+                    }
+                    else
+                    {
+                        const auto kernel = kernel_grouped_conv_fwd_multiple_abd_xdl_cshuffle<
+                            GridwiseGemm,
+                            const ADataType*,
+                            const BDataType*,
+                            typename GridwiseGemm::DsGridPointer,
+                            EDataType,
+                            AElementwiseOperation,
+                            BElementwiseOperation,
+                            CDEElementwiseOperation,
+                            DeviceOp::AGridDesc_AK0_M_AK1,
+                            DeviceOp::BGridDesc_BK0_N_BK1,
+                            DeviceOp::DsGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                            DeviceOp::EGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                            Block2ETileMap,
+                            ComputePtrOffsetOfStridedBatch<NumATensor, NumBTensor, NumDTensor>,
+                            ComputePtrOffsetOfStridedBatch<NumATensor, I1, NumDTensor>,
+                            has_main_loop,
+                            isMultiA,
+                            isMultiB,
+                            CTranspose>;
+
+                        return launch_and_time_kernel(
+                            stream_config,
+                            kernel,
+                            dim3(gdx, gdy, gdz),
+                            dim3(BlockSize),
+                            0,
+                            p_a_grid,
+                            p_b_grid,
+                            arg.p_ds_grid_,
+                            p_e_grid,
+                            arg.a_element_op_,
+                            arg.b_element_op_,
+                            arg.cde_element_op_,
+                            arg.a_grid_desc_ak0_m_ak1_,
+                            arg.b_grid_desc_bk0_n_bk1_,
+                            arg.ds_grid_desc_mblock_mperblock_nblock_nperblock_,
+                            arg.e_grid_desc_mblock_mperblock_nblock_nperblock_,
+                            arg.block_2_etile_map_,
+                            arg.compute_ptr_offset_of_groups_,
+                            arg.compute_ptr_offset_of_n_);
+                    }
                }
            };

@@ -1114,8 +1234,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
        {
            float avg_time = 0.f;

-            if constexpr(is_NGCHW_NGKHW<ALayout, BLayout, ELayout>() ||
-                         is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>())
+            if constexpr(NeedTransposeKernel)
            {
                const index_t a_grid_size =
                    arg.elementwise_block_2_ctile_map_transpose_a_.CalculateGridSize(
@@ -1166,8 +1285,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle

            avg_time += RunGemm(arg, stream_config);

-            if constexpr(is_NGCHW_NGKHW<ALayout, BLayout, ELayout>() ||
-                         is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>())
+            if constexpr(NeedTransposeKernel)
            {
                const index_t grid_size =
                    arg.elementwise_block_2_ctile_map_transpose_e_.CalculateGridSize(
@@ -1215,9 +1333,11 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
    {
        namespace ctc = tensor_layout::convolution;

-        const index_t G = arg.b_g_k_c_xs_lengths_[I0];
-        const index_t K = arg.b_g_k_c_xs_lengths_[I1];
-        const index_t C = arg.b_g_k_c_xs_lengths_[I2];
+        const index_t G                  = arg.b_g_k_c_xs_lengths_[I0];
+        const index_t K                  = arg.b_g_k_c_xs_lengths_[I1];
+        const index_t C                  = arg.b_g_k_c_xs_lengths_[I2];
+        const index_t input_spatial_acum = ck::accumulate_n<index_t>(
+            arg.a_g_n_c_wis_lengths_.begin() + I3, NDimSpatial, 1, std::multiplies<>());

        // check device
        if(get_device_name() == "gfx908")
@@ -1310,7 +1430,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                     is_same_v<ALayout, ctc::GNHWC> || is_same_v<ALayout, ctc::GNDHWC> ||
                     is_same_v<ALayout, ctc::NWGC> || is_same_v<ALayout, ctc::NHWGC> ||
                     is_same_v<ALayout, ctc::NDHWGC> || is_same_v<ALayout, ctc::NGCW> ||
-                     is_same_v<ALayout, ctc::NGCHW> || is_same_v<ALayout, ctc::NGCDHW>)
+                     NeedTransposeKernel)
        {
            // Check access per C
            if(!(ABlockTransferSrcVectorDim == 2 && C % ABlockTransferSrcScalarPerVector == 0))
@@ -1326,6 +1446,23 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                }
            }
        }
+        else if constexpr(is_same_v<ALayout, ctc::NGCHW> || is_same_v<ALayout, ctc::NGCDHW>)
+        {
+            static_assert(NeedTransposeKernel == false);
+            static_assert(NumGroupsToMerge == 1);
+
+            if constexpr(ABlockTransferSrcScalarPerVector != 1)
+            {
+                if(ABlockTransferSrcVectorDim != 1)
+                {
+                    return false;
+                }
+                if(input_spatial_acum % ABlockTransferSrcScalarPerVector != 0)
+                {
+                    return false;
+                }
+            }
+        }
        else
        {
            return false;
@@ -1350,7 +1487,6 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
        {
            return false;
        }
-
        //  check vector access of Ds
        bool valid = true;

@@ -1396,8 +1532,7 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
            }
        });

-        if constexpr(is_NGCHW_NGKHW<ALayout, BLayout, ELayout>() ||
-                     is_NGCDHW_NGKDHW<ALayout, BLayout, ELayout>())
+        if constexpr(NeedTransposeKernel)
        {
            if((G * C) % CDEBlockTransferScalarPerVector_NPerBlock != 0)
            {
@@ -1409,8 +1544,6 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                return false;
            }

-            const index_t input_spatial_acum = ck::accumulate_n<index_t>(
-                arg.a_g_n_c_wis_lengths_.begin() + I3, NDimSpatial, 1, std::multiplies<>());
            const index_t output_spatial_acum = ck::accumulate_n<index_t>(
                arg.e_g_n_k_wos_lengths_.begin() + I3, NDimSpatial, 1, std::multiplies<>());

@@ -1457,9 +1590,22 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
                     is_same_v<ELayout, ctc::NDHWGK> || is_same_v<ELayout, ctc::NGKW> ||
                     is_same_v<ELayout, ctc::NGKHW> || is_same_v<ELayout, ctc::NGKDHW>)
        {
-            if(!(K % CDEBlockTransferScalarPerVector_NPerBlock == 0))
+            if(CTranspose == false)
            {
-                return false;
+                if(!(K % CDEBlockTransferScalarPerVector_NPerBlock == 0))
+                {
+                    return false;
+                }
+            }
+            else
+            {
+                const index_t output_spatial_acum = ck::accumulate_n<index_t>(
+                    arg.e_g_n_k_wos_lengths_.begin() + I3, NDimSpatial, 1, std::multiplies<>());
+
+                if(output_spatial_acum % CDEBlockTransferScalarPerVector_NPerBlock != 0)
+                {
+                    return false;
+                }
            }
        }
        else
@@ -1483,11 +1629,22 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
        }
        else
        {
-            return GridwiseGemm::CheckValidity(arg.a_grid_desc_m_k_,
-                                               arg.b_grid_desc_n_k_,
-                                               arg.ds_grid_desc_m_n_,
-                                               arg.e_grid_desc_m_n_,
-                                               arg.block_2_etile_map_);
+            if constexpr(CTranspose)
+            {
+                return GridwiseGemmCTranspose::CheckValidity(arg.b_grid_desc_n_k_,
+                                                             arg.a_grid_desc_m_k_,
+                                                             arg.ds_grid_desc_m_n_,
+                                                             arg.e_grid_desc_m_n_,
+                                                             arg.block_2_etile_map_);
+            }
+            else
+            {
+                return GridwiseGemmCTranspose::CheckValidity(arg.a_grid_desc_m_k_,
+                                                             arg.b_grid_desc_n_k_,
+                                                             arg.ds_grid_desc_m_n_,
+                                                             arg.e_grid_desc_m_n_,
+                                                             arg.block_2_etile_map_);
+            }
        }
    }

--- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp
@@ -1473,7 +1473,12 @@ struct GridwiseMoeGemm
                                        index_t fused_token = scale_token_ids.AsType<index_t>()[m4];
                                        const index_t token_offset = fused_token & 0xffffff;
                                        return token_offset < problem.NumTokens
-                                                   ? p_sorted_weights_0[token_offset]
+                                                   ? p_sorted_weights_0[IsInputGemm
+                                                                            ? token_offset
+                                                                            : token_offset *
+                                                                                      problem.TopK +
+                                                                                  (fused_token >>
+                                                                                   24)]
                                                   : 0.0;
                                    }
                                    else
@@ -2190,7 +2195,12 @@ struct GridwiseMoeGemm
                                        index_t fused_token = scale_token_ids.AsType<index_t>()[m4];
                                        const index_t token_offset = fused_token & 0xffffff;
                                        return token_offset < problem.NumTokens
-                                                   ? p_sorted_weights_0[token_offset]
+                                                   ? p_sorted_weights_0[IsInputGemm
+                                                                            ? token_offset
+                                                                            : token_offset *
+                                                                                      problem.TopK +
+                                                                                  (fused_token >>
+                                                                                   24)]
                                                   : 0.0;
                                    }
                                    else
--- a/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp
+++ b/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp
@@ -19,7 +19,8 @@ template <index_t NDimSpatial,
          typename ADataType       = float,
          typename CDataType       = float,
          index_t NumGroupsToMerge = 1,
-          typename IndexType       = index_t>
+          typename IndexType       = index_t,
+          bool CTranspose          = false>
 struct TransformConvFwdToGemm
 {
    private:
@@ -1253,6 +1254,83 @@ struct TransformConvFwdToGemm
        }
    }

+    template <typename ALayout,
+              typename ck::enable_if<NDimSpatial == 1 &&
+                                         is_same_v<ALayout, tensor_layout::convolution::NGCW>,
+                                     bool>::type = false>
+    __host__ __device__ auto MakeADescriptor_M_K() const
+    {
+        static_assert(NumGroupsToMerge == 1);
+        static_assert(ConvForwardSpecialization ==
+                      device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0);
+
+        const auto in_gemmm_gemmk_desc = make_naive_tensor_descriptor(
+            make_tuple(N_, Wo_, C_), make_tuple(NStrideTensorA_, I1, CStrideTensorA_));
+
+        return transform_tensor_descriptor(
+            in_gemmm_gemmk_desc,
+            make_tuple(make_merge_transform(make_tuple(N_, Wo_)), make_pass_through_transform(C_)),
+            make_tuple(Sequence<0, 1>{}, Sequence<2>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+    }
+
+    template <typename ALayout,
+              typename ck::enable_if<NDimSpatial == 2 &&
+                                         is_same_v<ALayout, tensor_layout::convolution::NGCHW>,
+                                     bool>::type = false>
+    __host__ __device__ auto MakeADescriptor_M_K() const
+    {
+        static_assert(NumGroupsToMerge == 1);
+        static_assert(ConvForwardSpecialization ==
+                      device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0);
+
+        const auto in_gemmm_gemmk_desc = make_naive_tensor_descriptor(
+            make_tuple(N_, Ho_ * Wo_, C_), make_tuple(NStrideTensorA_, I1, CStrideTensorA_));
+
+        return transform_tensor_descriptor(
+            in_gemmm_gemmk_desc,
+            make_tuple(make_merge_transform(make_tuple(N_, Ho_ * Wo_)),
+                       make_pass_through_transform(C_)),
+            make_tuple(Sequence<0, 1>{}, Sequence<2>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+    }
+
+    template <typename ALayout,
+              typename ck::enable_if<NDimSpatial == 3 &&
+                                         is_same_v<ALayout, tensor_layout::convolution::NGCDHW>,
+                                     bool>::type = false>
+    __host__ __device__ auto MakeADescriptor_M_K() const
+    {
+        static_assert(NumGroupsToMerge == 1);
+        static_assert(ConvForwardSpecialization ==
+                      device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0);
+
+        const auto in_gemmm_gemmk_desc = make_naive_tensor_descriptor(
+            make_tuple(N_, Do_ * Ho_ * Wo_, C_), make_tuple(NStrideTensorA_, I1, CStrideTensorA_));
+
+        return transform_tensor_descriptor(
+            in_gemmm_gemmk_desc,
+            make_tuple(make_merge_transform(make_tuple(N_, Do_ * Ho_ * Wo_)),
+                       make_pass_through_transform(C_)),
+            make_tuple(Sequence<0, 1>{}, Sequence<2>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+    }
+
+    template <typename BLayout,
+              typename ck::enable_if<is_same_v<BLayout, tensor_layout::convolution::GKCX> ||
+                                         is_same_v<BLayout, tensor_layout::convolution::GKCYX> ||
+                                         is_same_v<BLayout, tensor_layout::convolution::GKCZYX>,
+                                     bool>::type = false>
+    __host__ __device__ auto MakeBDescriptor_N_K() const
+    {
+        static_assert(ConvForwardSpecialization ==
+                          device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0 ||
+                      ConvForwardSpecialization ==
+                          device::ConvolutionForwardSpecialization::Filter1x1Pad0);
+        static_assert(NumGroupsToMerge == 1);
+        return make_naive_tensor_descriptor_packed(make_tuple(K_, C_));
+    }
+
    template <typename BLayout,
              typename ck::enable_if<is_same_v<BLayout, tensor_layout::convolution::GKXC> ||
                                         is_same_v<BLayout, tensor_layout::convolution::GKYXC> ||
@@ -1338,8 +1416,16 @@ struct TransformConvFwdToGemm
                               bool>::type = false>
    __host__ __device__ auto MakeCDescriptor_M_N() const
    {
-        return make_naive_tensor_descriptor(make_tuple(N_ * Wo_, K_),
-                                            make_tuple(I0, KStrideTensorC_));
+        if constexpr(CTranspose)
+        {
+            return make_naive_tensor_descriptor(make_tuple(K_, N_ * Wo_),
+                                                make_tuple(KStrideTensorC_, I0));
+        }
+        else
+        {
+            return make_naive_tensor_descriptor(make_tuple(N_ * Wo_, K_),
+                                                make_tuple(I0, KStrideTensorC_));
+        }
    }

    template <
@@ -1350,8 +1436,16 @@ struct TransformConvFwdToGemm
                               bool>::type = false>
    __host__ __device__ auto MakeCDescriptor_M_N() const
    {
-        return make_naive_tensor_descriptor(make_tuple(N_ * Ho_ * Wo_, K_),
-                                            make_tuple(I0, KStrideTensorC_));
+        if constexpr(CTranspose)
+        {
+            return make_naive_tensor_descriptor(make_tuple(K_, N_ * Ho_ * Wo_),
+                                                make_tuple(KStrideTensorC_, I0));
+        }
+        else
+        {
+            return make_naive_tensor_descriptor(make_tuple(N_ * Ho_ * Wo_, K_),
+                                                make_tuple(I0, KStrideTensorC_));
+        }
    }

    template <
@@ -1362,12 +1456,21 @@ struct TransformConvFwdToGemm
                               bool>::type = false>
    __host__ __device__ auto MakeCDescriptor_M_N() const
    {
-        return make_naive_tensor_descriptor(make_tuple(N_ * Do_ * Ho_ * Wo_, K_),
-                                            make_tuple(I0, KStrideTensorC_));
+        if constexpr(CTranspose)
+        {
+            return make_naive_tensor_descriptor(make_tuple(K_, N_ * Do_ * Ho_ * Wo_),
+                                                make_tuple(KStrideTensorC_, I0));
+        }
+        else
+        {
+            return make_naive_tensor_descriptor(make_tuple(N_ * Do_ * Ho_ * Wo_, K_),
+                                                make_tuple(I0, KStrideTensorC_));
+        }
    }

    template <typename CLayout,
-              index_t NDimSp                     = NDimSpatial,
+              index_t NDimSp = NDimSpatial,
+
              typename ck::enable_if<NDimSp == 1 &&
                                         (is_same_v<CLayout, tensor_layout::convolution::G_NW_K> ||
                                          is_same_v<CLayout, tensor_layout::convolution::NWGK> ||
@@ -1375,6 +1478,7 @@ struct TransformConvFwdToGemm
                                     bool>::type = false>
    __host__ __device__ auto MakeCDescriptor_M_N() const
    {
+        static_assert(CTranspose == false);
        const IndexType NDoHoWo = N_ * Wo_;
        if constexpr(NumGroupsToMerge == 1)
        {
@@ -1429,6 +1533,7 @@ struct TransformConvFwdToGemm
                                     bool>::type = false>
    __host__ __device__ auto MakeCDescriptor_M_N() const
    {
+        static_assert(CTranspose == false);
        const IndexType NDoHoWo = N_ * Ho_ * Wo_;
        if constexpr(NumGroupsToMerge == 1)
        {
@@ -1486,7 +1591,7 @@ struct TransformConvFwdToGemm
                  bool>::type = false>
    __host__ __device__ auto MakeCDescriptor_M_N() const
    {
-
+        static_assert(CTranspose == false);
        const IndexType NDoHoWo = N_ * Do_ * Ho_ * Wo_;
        if constexpr(NumGroupsToMerge == 1)
        {
@@ -1536,6 +1641,101 @@ struct TransformConvFwdToGemm
        }
    }

+    template <typename CLayout,
+              index_t NDimSp = NDimSpatial,
+
+              typename ck::enable_if<NDimSp == 1 &&
+                                         (is_same_v<CLayout, tensor_layout::convolution::GNKW> ||
+                                          is_same_v<CLayout, tensor_layout::convolution::NGKW>),
+                                     bool>::type = false>
+    __host__ __device__ auto MakeCDescriptor_M_N() const
+    {
+        static_assert(NumGroupsToMerge == 1);
+        auto n_k_wo_desc = make_naive_tensor_descriptor(
+            make_tuple(N_, K_, Wo_), make_tuple(NStrideTensorC_, KStrideTensorC_, I1));
+        if constexpr(CTranspose)
+        {
+            return transform_tensor_descriptor(
+                n_k_wo_desc,
+                make_tuple(make_pass_through_transform(K_),
+                           make_merge_transform(make_tuple(N_, Wo_))),
+                make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else
+        {
+            return transform_tensor_descriptor(n_k_wo_desc,
+                                               make_tuple(make_merge_transform(make_tuple(N_, Wo_)),
+                                                          make_pass_through_transform(K_)),
+                                               make_tuple(Sequence<0, 2>{}, Sequence<1>{}),
+                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+    }
+
+    template <typename CLayout,
+              index_t NDimSp = NDimSpatial,
+
+              typename ck::enable_if<NDimSp == 2 &&
+                                         (is_same_v<CLayout, tensor_layout::convolution::GNKHW> ||
+                                          is_same_v<CLayout, tensor_layout::convolution::NGKHW>),
+                                     bool>::type = false>
+    __host__ __device__ auto MakeCDescriptor_M_N() const
+    {
+        static_assert(NumGroupsToMerge == 1);
+        auto n_k_howo_desc = make_naive_tensor_descriptor(
+            make_tuple(N_, K_, Ho_ * Wo_), make_tuple(NStrideTensorC_, KStrideTensorC_, I1));
+        if constexpr(CTranspose)
+        {
+            return transform_tensor_descriptor(
+                n_k_howo_desc,
+                make_tuple(make_pass_through_transform(K_),
+                           make_merge_transform(make_tuple(N_, Ho_ * Wo_))),
+                make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else
+        {
+            return transform_tensor_descriptor(
+                n_k_howo_desc,
+                make_tuple(make_merge_transform(make_tuple(N_, Ho_ * Wo_)),
+                           make_pass_through_transform(K_)),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+    }
+
+    template <typename CLayout,
+              index_t NDimSp = NDimSpatial,
+
+              typename ck::enable_if<NDimSp == 3 &&
+                                         (is_same_v<CLayout, tensor_layout::convolution::GNKDHW> ||
+                                          is_same_v<CLayout, tensor_layout::convolution::NGKDHW>),
+                                     bool>::type = false>
+    __host__ __device__ auto MakeCDescriptor_M_N() const
+    {
+        static_assert(NumGroupsToMerge == 1);
+        auto n_k_dohowo_desc = make_naive_tensor_descriptor(
+            make_tuple(N_, K_, Do_ * Ho_ * Wo_), make_tuple(NStrideTensorC_, KStrideTensorC_, I1));
+
+        if constexpr(CTranspose)
+        {
+            return transform_tensor_descriptor(
+                n_k_dohowo_desc,
+                make_tuple(make_pass_through_transform(K_),
+                           make_merge_transform(make_tuple(N_, Do_ * Ho_ * Wo_))),
+                make_tuple(Sequence<1>{}, Sequence<0, 2>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+        else
+        {
+            return transform_tensor_descriptor(
+                n_k_dohowo_desc,
+                make_tuple(make_merge_transform(make_tuple(N_, Do_ * Ho_ * Wo_)),
+                           make_pass_through_transform(K_)),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+        }
+    }
    IndexType N_;
    IndexType Di_, Hi_, Wi_;
    IndexType Do_, Ho_, Wo_;
--- a/include/ck/utility/amd_ck_fp8.hpp
+++ b/include/ck/utility/amd_ck_fp8.hpp
@@ -5,19 +5,16 @@

 #include "ck/ck.hpp"
 #include "ck/utility/enable_if.hpp"
+#include "ck/utility/get_id.hpp"
 #include "ck/utility/random_gen.hpp"
 #include "ck/utility/functional.hpp"
 #include "ck/utility/type.hpp"

-#ifdef CK_USE_FNUZ_FP8
-#define CK_USE_FNUZ_FP8 1
-#else
+#ifndef CK_USE_FNUZ_FP8
 #define CK_USE_FNUZ_FP8 0
 #endif

-#ifdef CK_USE_OCP_FP8
-#define CK_USE_OCP_FP8 1
-#else
+#ifndef CK_USE_OCP_FP8
 #define CK_USE_OCP_FP8 0
 #endif

@@ -431,7 +428,7 @@ __host__ __device__ inline constexpr bool fp8_is_inf(bf8_ocp_t a)
 namespace fp8_impl {

 // Assertions to check for supported conversion types
-#define __assert_ocp_support(interp)                                               \
+#define __fp8_impl_assert_ocp_support(interp)                                      \
    {                                                                              \
        if(interp != ck_fp8_interpretation_t::CK_E4M3_OCP &&                       \
           interp != ck_fp8_interpretation_t::CK_E5M2_OCP)                         \
@@ -439,7 +436,7 @@ namespace fp8_impl {
            __hip_assert(false && "type is unsupported by current target device"); \
        }                                                                          \
    }
-#define __assert_fnuz_support(interp)                                              \
+#define __fp8_impl_assert_fnuz_support(interp)                                     \
    {                                                                              \
        if(interp != ck_fp8_interpretation_t::CK_E4M3_FNUZ &&                      \
           interp != ck_fp8_interpretation_t::CK_E5M2_FNUZ)                        \
@@ -453,10 +450,10 @@ __is_interpret_supported([[maybe_unused]] ck_fp8_interpretation_t interp)
 {
 #if defined(__HIP_DEVICE_COMPILE__) && __HIP_DEVICE_COMPILE__
 #if CK_USE_OCP_FP8
-    __assert_ocp_support(interp);
+    __fp8_impl_assert_ocp_support(interp);
 #endif
 #if CK_USE_FNUZ_FP8
-    __assert_fnuz_support(interp);
+    __fp8_impl_assert_fnuz_support(interp);
 #endif
 #endif
 }
@@ -1396,12 +1393,18 @@ __host__ __device__ static inline fp8_storage_t cvt_float_to_fp8(const float f)
    uint32_t rng = 0;
    if constexpr(stochastic_rounding)
    {
+#if defined(__gfx950__)
+        // use HW clock for stochastic input multiply by incremented thread id
+        rng = __builtin_amdgcn_prng_b32(__builtin_amdgcn_s_memrealtime() *
+                                        (get_thread_global_1d_id() + 1));
+#else
        constexpr int seed = 1254739;
 #ifndef CK_CODE_GEN_RTC
-        rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&f), f);
+        rng                = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&f), f);
 #else
        rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&f), f);
-#endif
+#endif // #ifndef CK_CODE_GEN_RTC
+#endif // #if defined(__gfx950__)
    }
    return cast_to_f8_from_f32<interp, sat == ck_saturation_t::CK_SATFINITE, stochastic_rounding>(
        f, rng);
@@ -1416,12 +1419,18 @@ __host__ static inline fp8_storage_t cvt_float_to_fp8(const float f)
    uint32_t rng = 0;
    if constexpr(stochastic_rounding)
    {
+#if defined(__gfx950__)
+        // use HW clock for stochastic input multiply by incremented thread id
+        rng = __builtin_amdgcn_prng_b32(__builtin_amdgcn_s_memrealtime() *
+                                        (get_thread_global_1d_id() + 1));
+#else
        constexpr int seed = 1254739;
 #ifndef CK_CODE_GEN_RTC
-        rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&f), f);
+        rng                = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&f), f);
 #else
        rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&f), f);
-#endif
+#endif // #ifndef CK_CODE_GEN_RTC
+#endif // #if defined(__gfx950__)
    }

    if constexpr(interp == ck_fp8_interpretation_t::CK_E4M3_FNUZ)
@@ -1487,12 +1496,18 @@ __device__ static inline fp8x2_storage_t cvt_float_to_fp8(const float2_t f)
    uint32_t rng = 0;
    if constexpr(stochastic_rounding)
    {
+#if defined(__gfx950__)
+        // use HW clock for stochastic input multiply by incremented thread id
+        rng = __builtin_amdgcn_prng_b32(__builtin_amdgcn_s_memrealtime() *
+                                        (get_thread_global_1d_id() + 1));
+#else
        constexpr int seed = 1254739;
 #ifndef CK_CODE_GEN_RTC
-        rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&f), f[0]);
+        rng                = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&f), f[0]);
 #else
        rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&f), f[0]);
-#endif
+#endif // #ifndef CK_CODE_GEN_RTC
+#endif // #if defined(__gfx950__)
    }
    return cast_to_f8_from_f32<interp, sat == ck_saturation_t::CK_SATFINITE, stochastic_rounding>(
        f, rng);
@@ -1532,12 +1547,18 @@ __host__ static inline fp8_storage_t cvt_half_t_to_fp8(const _Float16 x)
        uint32_t rng = 0;
        if constexpr(stochastic_rounding)
        {
+#if defined(__gfx950__)
+            // use HW clock for stochastic input multiply by incremented thread id
+            rng = __builtin_amdgcn_prng_b32(__builtin_amdgcn_s_memrealtime() *
+                                            (get_thread_global_1d_id() + 1));
+#else
            constexpr int seed = 1254739;
 #ifndef CK_CODE_GEN_RTC
            rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
 #else
            rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&x), x);
-#endif
+#endif // #ifndef CK_CODE_GEN_RTC
+#endif // #if defined(__gfx950__)
        }
 #if defined(__gfx950__)
        return cast_to_f8_from_f16<interp,
@@ -1574,12 +1595,18 @@ __host__ static inline fp8x2_storage_t cvt_half_t_to_fp8(const half2_t x)
        uint32_t rng = 0;
        if constexpr(stochastic_rounding)
        {
+#if defined(__gfx950__)
+            // use HW clock for stochastic input multiply by incremented thread id
+            rng = __builtin_amdgcn_prng_b32(__builtin_amdgcn_s_memrealtime() *
+                                            (get_thread_global_1d_id() + 1));
+#else
            constexpr int seed = 1254739;
 #ifndef CK_CODE_GEN_RTC
            rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x[0]);
 #else
            rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&x), x[0]);
-#endif
+#endif // #ifndef CK_CODE_GEN_RTC
+#endif // #if defined(__gfx950__)
        }
 #if defined(__gfx950__)
        return cast_to_f8_from_f16<interp,
@@ -1616,13 +1643,19 @@ __host__ static inline fp8_storage_t cvt_bhalf_t_to_fp8(const ushort x)
        uint32_t rng = 0;
        if constexpr(stochastic_rounding)
        {
+#if defined(__gfx950__)
+            // use HW clock for stochastic input multiply by incremented thread id
+            rng = __builtin_amdgcn_prng_b32(__builtin_amdgcn_s_memrealtime() *
+                                            (get_thread_global_1d_id() + 1));
+#else
            constexpr int seed = 1254739;
 #ifndef CK_CODE_GEN_RTC
            rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x),
                                               static_cast<float>(x));
 #else
            rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&x), static_cast<float>(x));
-#endif
+#endif // #ifndef CK_CODE_GEN_RTC
+#endif // #if defined(__gfx950__)
        }
 #if defined(__gfx950__)
        return cast_to_f8_from_bf16<interp,
@@ -1664,14 +1697,20 @@ __host__ static inline fp8x2_storage_t cvt_bhalf_t_to_fp8(const ushortx2_t x)
        uint32_t rng = 0;
        if constexpr(stochastic_rounding)
        {
+#if defined(__gfx950__)
+            // use HW clock for stochastic input multiply by incremented thread id
+            rng = __builtin_amdgcn_prng_b32(__builtin_amdgcn_s_memrealtime() *
+                                            (get_thread_global_1d_id() + 1));
+#else
            constexpr int seed = 1254739;
 #ifndef CK_CODE_GEN_RTC
-            rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x),
+            rng                = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x),
                                               static_cast<float>(x[0]));
 #else
            rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&x),
                                               static_cast<float>(x[0]));
-#endif
+#endif // #ifndef CK_CODE_GEN_RTC
+#endif // #if defined(__gfx950__)
        }
 #if defined(__gfx950__)
        return cast_to_f8_from_bf16<interp,
--- a/include/ck/utility/filter_tuple.hpp
+++ b/include/ck/utility/filter_tuple.hpp
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+#include "ck/utility/functional.hpp"
+#include "ck/utility/sequence.hpp"
+
+namespace ck::util {
+
+template <typename Tuple, std::size_t Stride, std::size_t Offset>
+struct filter_tuple_by_modulo
+{
+    // Validate Stride and Offset.
+    static_assert(Stride > 0, "Offset must be positive.");
+    static_assert(Offset >= 0 && Offset < Stride,
+                  "Offset must be positive and less than the stride.");
+
+    // Generate filtered indices for this stride and offset.
+    static constexpr int new_size = (std::tuple_size_v<Tuple> + Stride - Offset - 1) / Stride;
+
+    template <std::size_t... Is>
+    static constexpr auto to_index(std::index_sequence<Is...>)
+    {
+        return std::index_sequence<(Offset + Is * Stride)...>{};
+    }
+
+    using filtered_indices = decltype(to_index(std::make_index_sequence<new_size>{}));
+
+    // Helper struct to construct the new tuple type from the filtered indices.
+    template <typename T, typename Indices>
+    struct make_filtered_tuple_type_impl;
+
+    template <typename T, std::size_t... Is>
+    struct make_filtered_tuple_type_impl<T, std::index_sequence<Is...>>
+    {
+        using type = std::tuple<std::tuple_element_t<Is, T>...>;
+    };
+
+    using type = typename make_filtered_tuple_type_impl<Tuple, filtered_indices>::type;
+};
+
+// Filter a tuple with a stride and offset.
+//
+// Tuple is a std::tuple or equivalent
+// Stride is a positive integer
+// Offset is a positive integer smaller than ofset
+//
+// Evaluates to a smaller tuple type from elements of T with stride M and offset I.
+//
+// Can be used to filter a tuple of types for sharded instantiations.
+template <typename Tuple, std::size_t Stride, std::size_t Offset>
+using filter_tuple_by_modulo_t = typename filter_tuple_by_modulo<Tuple, Stride, Offset>::type;
+
+// Example compile-time test:
+// using OriginalTuple =
+//    std::tuple<int, double, char, float, long, short, bool, char, long long, unsigned int>;
+// using NewTuple_Every3rdFrom2nd = filter_tuple_by_modulo_t<OriginalTuple, 3, 1>;
+// static_assert(std::is_same_v<NewTuple_Every3rdFrom2nd, std::tuple<double, long, char>>,
+//               "Test Case 1 Failed: Every 3rd from 2nd");
+
+} // namespace ck::util
--- a/include/ck/utility/mxf8_utils.hpp
+++ b/include/ck/utility/mxf8_utils.hpp
@@ -197,8 +197,9 @@ __host__ __device__ static inline fp8_storage_t cvt_float_to_fp8_scaled(const fl
    uint32_t rng = 0;
    if constexpr(stochastic_rounding)
    {
-        constexpr int seed = 1254739;
-        rng                = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&f), f);
+        // use HW clock for stochastic input multiply by incremented thread id
+        rng = __builtin_amdgcn_prng_b32(__builtin_amdgcn_s_memrealtime() *
+                                        (get_thread_global_1d_id() + 1));
    }
    return cast_to_f8_from_f32_scaled<interp, stochastic_rounding>(f, rng, scale);
 }
@@ -221,8 +222,9 @@ __host__ __device__ static inline fp8x2_storage_t cvt_float_to_fp8_scaled(const
    uint32_t rng = 0;
    if constexpr(stochastic_rounding)
    {
-        constexpr int seed = 1254739;
-        rng                = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&f), f[0]);
+        // use HW clock for stochastic input multiply by incremented thread id
+        rng = __builtin_amdgcn_prng_b32(__builtin_amdgcn_s_memrealtime() *
+                                        (get_thread_global_1d_id() + 1));
    }
    return cast_to_f8_from_f32_scaled<interp, stochastic_rounding>(f, rng, scale);
 }
--- a/include/ck/utility/type_convert.hpp
+++ b/include/ck/utility/type_convert.hpp
@@ -5,6 +5,7 @@

 #include "ck/utility/data_type.hpp"
 #include "ck/utility/f8_utils.hpp"
+#include "ck/utility/get_id.hpp"
 #include "ck/utility/mxf4_utils.hpp"
 #include "ck/utility/mxf6_utils.hpp"
 #include "ck/utility/random_gen.hpp"
@@ -234,12 +235,18 @@ __host__ __device__ constexpr Y f8_convert_sr(X x);
 template <>
 inline __host__ __device__ f8_fnuz_t f8_convert_sr<f8_fnuz_t, float>(float x)
 {
+#if defined(__gfx950__)
+    // use HW clock for stochastic input multiply by incremented thread id
+    uint32_t rng = __builtin_amdgcn_prng_b32(__builtin_amdgcn_s_memrealtime() *
+                                             (get_thread_global_1d_id() + 1));
+#else
    constexpr int seed = 1254739;
 #ifndef CK_CODE_GEN_RTC
-    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
+    uint32_t rng       = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
 #else
    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&x), x);
-#endif
+#endif // #ifndef CK_CODE_GEN_RTC
+#endif // #if defined(__gfx950__)
 #if defined(__gfx94__)
    union
    {
@@ -296,12 +303,18 @@ inline __host__ __device__ f8_fnuz_t f8_convert_sr<f8_fnuz_t, half_t>(half_t x)
 template <>
 inline __host__ __device__ bf8_fnuz_t f8_convert_sr<bf8_fnuz_t, float>(float x)
 {
+#if defined(__gfx950__)
+    // use HW clock for stochastic input multiply by incremented thread id
+    uint32_t rng = __builtin_amdgcn_prng_b32(__builtin_amdgcn_s_memrealtime() *
+                                             (get_thread_global_1d_id() + 1));
+#else
    constexpr int seed = 1254739;
 #ifndef CK_CODE_GEN_RTC
-    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
+    uint32_t rng       = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
 #else
    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&x), x);
-#endif
+#endif // #ifndef CK_CODE_GEN_RTC
+#endif // #if defined(__gfx950__)
 #if defined(__gfx94__)
    union
    {
@@ -1446,13 +1459,10 @@ inline __host__ __device__ f4x32_t f4_convert_rne(float32_t x, float scale = 1.0
 // convert fp32 to fp4 with stochastic rounding
 inline __host__ __device__ f4_t f4_convert_sr(float x, float scale = 1.0f)
 {
-    constexpr int seed = 1254739;
-#ifndef CK_CODE_GEN_RTC
-    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
-#else
-    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&x), x);
-#endif
 #if defined(__gfx950__)
+    // use HW clock for stochastic input multiply by incremented thread id
+    uint32_t rng = __builtin_amdgcn_prng_b32(__builtin_amdgcn_s_memrealtime() *
+                                             (get_thread_global_1d_id() + 1));
    union
    {
        uint32_t bitwise;
@@ -1468,6 +1478,12 @@ inline __host__ __device__ f4_t f4_convert_sr(float x, float scale = 1.0f)
        value.bitwise, float_values.float2_array, rng, scale, 0);
    return value.f4_array[0];
 #else
+    constexpr int seed = 1254739;
+#ifndef CK_CODE_GEN_RTC
+    uint32_t rng       = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
+#else
+    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&x), x);
+#endif
    return utils::sat_convert_to_type_sr<f4_t>(x / scale, rng);
 #endif
 }
@@ -1475,30 +1491,26 @@ inline __host__ __device__ f4_t f4_convert_sr(float x, float scale = 1.0f)
 // convert vector of 2 fp32 to vector of 2 fp4 with sr
 inline __host__ __device__ f4x2_t f4_convert_sr(float2_t x, float scale = 1.0f)
 {
-    constexpr int seed = 1254739;
-#ifndef CK_CODE_GEN_RTC
-    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x[0]);
-#else
-    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&x), x[0]);
-#endif
 #if defined(__gfx950__)
+    // use HW clock for stochastic input multiply by incremented thread id
+    uint32_t rng = __builtin_amdgcn_prng_b32(__builtin_amdgcn_s_memrealtime() *
+                                             (get_thread_global_1d_id() + 1));
    union
    {
        uint32_t bitwise;
        f4x2_t f4x2_array[4];
    } value{0};
-// apply a temporary workaround for gfx950
-#if CK_WORKAROUND_FP32_TO_FP4_SR_CONVERSION
-    uint8_t l     = utils::sat_convert_to_type_sr<f4_t>(x[1] / scale, rng);
-    uint8_t h     = utils::sat_convert_to_type_sr<f4_t>(x[0] / scale, rng);
-    value.bitwise = (h << 4) | l;
-#else
    // permute high bits and low bits to match the order of the original vector
    value.bitwise = __builtin_amdgcn_cvt_scalef32_sr_pk_fp4_f32(
        value.bitwise, float2_t{x[1], x[0]}, rng, scale, 0);
-#endif // CK_WORKAROUND_FP32_TO_FP4_SR_CONVERSION
    return value.f4x2_array[0];
 #else
+    constexpr int seed = 1254739;
+#ifndef CK_CODE_GEN_RTC
+    uint32_t rng       = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x[0]);
+#else
+    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&x), x[0]);
+#endif
    union
    {
        uint32_t bitwise;
@@ -1514,13 +1526,10 @@ inline __host__ __device__ f4x2_t f4_convert_sr(float2_t x, float scale = 1.0f)
 // convert vector of 32 fp32 to vector of 32 fp4 with sr
 inline __host__ __device__ f4x32_t f4_convert_sr(float32_t x, float scale = 1.0f)
 {
-    constexpr int seed = 1254739;
-#ifndef CK_CODE_GEN_RTC
-    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x[0]);
-#else
-    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&x), x[0]);
-#endif
 #if defined(__gfx950__)
+    // use HW clock for stochastic input multiply by incremented thread id
+    uint32_t rng = __builtin_amdgcn_prng_b32(__builtin_amdgcn_s_memrealtime() *
+                                             (get_thread_global_1d_id() + 1));
    union
    {
        __uint128_t bitwise;
@@ -1546,6 +1555,12 @@ inline __host__ __device__ f4x32_t f4_convert_sr(float32_t x, float scale = 1.0f

    return f4_values.f4x32_array;
 #else
+    constexpr int seed = 1254739;
+#ifndef CK_CODE_GEN_RTC
+    uint32_t rng       = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x[0]);
+#else
+    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&x), x[0]);
+#endif
    union
    {
        __uint128_t bitwise;
@@ -1776,13 +1791,10 @@ inline __host__ __device__ f6x32_t f6_convert_rne(float32_t x, float scale = 1.0
 */
 inline __host__ __device__ f6_t f6_convert_sr(float x, float scale = 1.0f)
 {
-    constexpr int seed = 1254739;
-#ifndef CK_CODE_GEN_RTC
-    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
-#else
-    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&x), x);
-#endif
 #if defined(__gfx950__)
+    // use HW clock for stochastic input multiply by incremented thread id
+    uint32_t rng = __builtin_amdgcn_prng_b32(__builtin_amdgcn_s_memrealtime() *
+                                             (get_thread_global_1d_id() + 1));
    union
    {
        float32_t float_vector;
@@ -1799,6 +1811,12 @@ inline __host__ __device__ f6_t f6_convert_sr(float x, float scale = 1.0f)

    return out.f6_array[0];
 #else
+    constexpr int seed = 1254739;
+#ifndef CK_CODE_GEN_RTC
+    uint32_t rng       = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
+#else
+    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&x), x);
+#endif
    return utils::sat_convert_to_type_sr<f6_t>(x / scale, rng);
 #endif
 }
@@ -1815,6 +1833,12 @@ inline __host__ __device__ f6_t f6_convert_sr(float x, float scale = 1.0f)
 */
 inline __host__ __device__ f6x32_t f6_convert_sr(float32_t x, float scale = 1.0f)
 {
+#if defined(__gfx950__)
+    // use HW clock for stochastic input multiply by incremented thread id
+    uint32_t rng = __builtin_amdgcn_prng_b32(__builtin_amdgcn_s_memrealtime() *
+                                             (get_thread_global_1d_id() + 1));
+    return __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f32(x, rng, scale);
+#else
    constexpr int seed = 1254739;
    union
    {
@@ -1828,9 +1852,6 @@ inline __host__ __device__ f6x32_t f6_convert_sr(float32_t x, float scale = 1.0f
    uint32_t rng =
        prand_generator<float, seed>(reinterpret_cast<size_t>(&x), float_values.float_array[0]);
 #endif
-#if defined(__gfx950__)
-    return __builtin_amdgcn_cvt_scalef32_sr_pk32_fp6_f32(x, rng, scale);
-#else
    union
    {
        float32_t float_vector;
@@ -2044,13 +2065,10 @@ inline __host__ __device__ bf6x32_t bf6_convert_rne(float32_t x, float scale = 1
 */
 inline __host__ __device__ bf6_t bf6_convert_sr(float x, float scale = 1.0f)
 {
-    constexpr int seed = 1254739;
-#ifndef CK_CODE_GEN_RTC
-    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
-#else
-    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&x), x);
-#endif
 #if defined(__gfx950__)
+    // use HW clock for stochastic input multiply by incremented thread id
+    uint32_t rng = __builtin_amdgcn_prng_b32(__builtin_amdgcn_s_memrealtime() *
+                                             (get_thread_global_1d_id() + 1));
    union
    {
        float32_t float_vector;
@@ -2067,6 +2085,12 @@ inline __host__ __device__ bf6_t bf6_convert_sr(float x, float scale = 1.0f)

    return out.bf6_array[0];
 #else
+    constexpr int seed = 1254739;
+#ifndef CK_CODE_GEN_RTC
+    uint32_t rng       = prand_generator<float, seed>(reinterpret_cast<uintptr_t>(&x), x);
+#else
+    uint32_t rng = prand_generator<float, seed>(reinterpret_cast<size_t>(&x), x);
+#endif
    return utils::sat_convert_to_type_sr<bf6_t>(x / scale, rng);
 #endif
 }
@@ -2085,6 +2109,12 @@ inline __host__ __device__ bf6_t bf6_convert_sr(float x, float scale = 1.0f)
 */
 inline __host__ __device__ bf6x32_t bf6_convert_sr(float32_t x, float scale = 1.0f)
 {
+#if defined(__gfx950__)
+    // use HW clock for stochastic input multiply by incremented thread id
+    uint32_t rng = __builtin_amdgcn_prng_b32(__builtin_amdgcn_s_memrealtime() *
+                                             (get_thread_global_1d_id() + 1));
+    return __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f32(x, rng, scale);
+#else
    constexpr int seed = 1254739;
    union
    {
@@ -2098,9 +2128,6 @@ inline __host__ __device__ bf6x32_t bf6_convert_sr(float32_t x, float scale = 1.
    uint32_t rng =
        prand_generator<float, seed>(reinterpret_cast<size_t>(&x), float_values.float_array[0]);
 #endif
-#if defined(__gfx950__)
-    return __builtin_amdgcn_cvt_scalef32_sr_pk32_bf6_f32(x, rng, scale);
-#else
    union
    {
        float32_t float_vector;
--- a/include/ck_tile/core/numeric/integer.hpp
+++ b/include/ck_tile/core/numeric/integer.hpp
@@ -7,6 +7,7 @@
 namespace ck_tile {

 using index_t      = int32_t;
+using int32_t      = int32_t;
 using long_index_t = int64_t;
 using int8_t       = int8_t;

--- a/include/ck_tile/core/tensor/buffer_view.hpp
+++ b/include/ck_tile/core/tensor/buffer_view.hpp
@@ -1009,6 +1009,15 @@ struct buffer_view<address_space_enum::lds,
                         std::is_same_v<remove_cvref_t<X>, int8x8_t>) ||
                        (std::is_same_v<remove_cvref_t<T>, int8x16_t> &&
                         std::is_same_v<remove_cvref_t<X>, int8x16_t>) ||
+                        // int8 on thread buffer
+                        (std::is_same_v<remove_cvref_t<T>, int8_t> &&
+                         std::is_same_v<remove_cvref_t<X>, thread_buffer<int8_t, 8>>) ||
+                        (std::is_same_v<remove_cvref_t<T>, int8_t> &&
+                         std::is_same_v<remove_cvref_t<X>, thread_buffer<int8_t, 4>>) ||
+                        (std::is_same_v<remove_cvref_t<T>, int8_t> &&
+                         std::is_same_v<remove_cvref_t<X>, thread_buffer<int8_t, 2>>) ||
+                        (std::is_same_v<remove_cvref_t<T>, int8_t> &&
+                         std::is_same_v<remove_cvref_t<X>, thread_buffer<int8_t, 1>>) ||
                        // ext_vector_type for pk_int4 must use int8_t as type
                        (std::is_same_v<remove_cvref_t<T>, pk_int4_t> &&
                         std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 1>>) ||
@@ -1031,6 +1040,8 @@ struct buffer_view<address_space_enum::lds,

                if constexpr((std::is_same_v<remove_cvref_t<T>, int8_t> &&
                              std::is_same_v<remove_cvref_t<X>, int8_t>) ||
+                             (std::is_same_v<remove_cvref_t<T>, int8_t> &&
+                              std::is_same_v<remove_cvref_t<X>, thread_buffer<int8_t, 1>>) ||
                             (std::is_same_v<remove_cvref_t<T>, pk_int4_t> &&
                              std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 1>>))
                {
@@ -1041,6 +1052,8 @@ struct buffer_view<address_space_enum::lds,
                }
                else if constexpr((std::is_same_v<remove_cvref_t<T>, int8_t> &&
                                   std::is_same_v<remove_cvref_t<X>, int8x2_t>) ||
+                                  (std::is_same_v<remove_cvref_t<T>, int8_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, thread_buffer<int8_t, 2>>) ||
                                  (std::is_same_v<remove_cvref_t<T>, pk_int4_t> &&
                                   std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 2>>))
                {
@@ -1051,6 +1064,8 @@ struct buffer_view<address_space_enum::lds,
                }
                else if constexpr((std::is_same_v<remove_cvref_t<T>, int8_t> &&
                                   std::is_same_v<remove_cvref_t<X>, int8x4_t>) ||
+                                  (std::is_same_v<remove_cvref_t<T>, int8_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, thread_buffer<int8_t, 4>>) ||
                                  (std::is_same_v<remove_cvref_t<T>, pk_int4_t> &&
                                   std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 4>>))
                {
@@ -1061,6 +1076,8 @@ struct buffer_view<address_space_enum::lds,
                }
                else if constexpr((std::is_same_v<remove_cvref_t<T>, int8_t> &&
                                   std::is_same_v<remove_cvref_t<X>, int8x8_t>) ||
+                                  (std::is_same_v<remove_cvref_t<T>, int8_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, thread_buffer<int8_t, 8>>) ||
                                  (std::is_same_v<remove_cvref_t<T>, pk_int4_t> &&
                                   std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 8>>))
                {
--- a/include/ck_tile/core/tensor/shuffle_tile.hpp
+++ b/include/ck_tile/core/tensor/shuffle_tile.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -129,7 +129,10 @@ CK_TILE_DEVICE void shuffle_tile_impl_in_thread(OutTensor& out_tensor, const InT
        // set output vectors
        static_for<0, num_vec_out, 1>{}([&](auto i) {
            constexpr auto idx_y_out_tmp = generate_array(
-                [&](auto ii) { return ii == y_dim_vec_in ? idx_y_start[ii] + i : idx_y_start[ii]; },
+                [&](auto ii) {
+                    return ii == y_dim_vec_in ? static_cast<index_t>(idx_y_start[ii]) + i
+                                              : static_cast<index_t>(idx_y_start[ii]);
+                },
                number<NDimY>{});

            constexpr auto idx_y_out =
--- a/include/ck_tile/core/tensor/tile_window_linear.hpp
+++ b/include/ck_tile/core/tensor/tile_window_linear.hpp
@@ -314,8 +314,7 @@ struct tile_window_linear

        constexpr auto tile_dstr = typename Base::TileDstr{};

-        auto dst_tensor =
-            make_static_distributed_tensor<typename Base::DataTypeDataType>(tile_dstr);
+        auto dst_tensor = make_static_distributed_tensor<typename Base::DataType>(tile_dstr);

        auto issue = [&](auto i_access_) {
            constexpr auto IAccess = number<i_access_>{};
@@ -348,8 +347,9 @@ struct tile_window_linear
                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
                                      Base::Traits::PackedSize;

-                dst_tensor.get_thread_buffer().template at<d>() = vec_value.template get_as<
-                    typename Base::DataTypeDataType>()[j / Base::Traits::PackedSize];
+                dst_tensor.get_thread_buffer().template at<d>() =
+                    vec_value
+                        .template get_as<typename Base::DataType>()[j / Base::Traits::PackedSize];
            });
        };

@@ -400,8 +400,9 @@ struct tile_window_linear
                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
                                      Base::Traits::PackedSize;

-                dst_tensor.get_thread_buffer().template at<d>() = vec_value.template get_as<
-                    typename Base::DataTypeDataType>()[j / Base::Traits::PackedSize];
+                dst_tensor.get_thread_buffer().template at<d>() =
+                    vec_value
+                        .template get_as<typename Base::DataType>()[j / Base::Traits::PackedSize];
            });
        };

@@ -805,8 +806,7 @@ struct tile_window_linear
                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
                                      Base::Traits::PackedSize;

-                vec_value.template get_as<typename Base::DataTypeDataType>()(
-                    j / Base::Traits::PackedSize) =
+                vec_value.template get_as<typename Base::DataType>()(j / Base::Traits::PackedSize) =
                    dstr_tensor.get_thread_buffer().template at<d>();
            });

@@ -861,8 +861,7 @@ struct tile_window_linear
                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
                                      Base::Traits::PackedSize;

-                vec_value.template get_as<typename Base::DataTypeDataType>()(
-                    j / Base::Traits::PackedSize) =
+                vec_value.template get_as<typename Base::DataType>()(j / Base::Traits::PackedSize) =
                    dstr_tensor.get_thread_buffer().template at<d>();
            });

--- a/include/ck_tile/host/host_tensor.hpp
+++ b/include/ck_tile/host/host_tensor.hpp
@@ -230,7 +230,7 @@ struct HostTensorDescriptor
     * @param iss Vector containing the multi-dimensional indices
     * @return The calculated linear offset as a size_t
     */
-    std::size_t GetOffsetFromMultiIndex(std::vector<std::size_t> iss) const
+    std::size_t GetOffsetFromMultiIndex(const std::vector<std::size_t>& iss) const
    {
        return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0});
    }
@@ -540,9 +540,12 @@ struct HostTensor
        return mData[GetOffsetFromMultiIndex(is...)];
    }

-    T& operator()(std::vector<std::size_t> idx) { return mData[GetOffsetFromMultiIndex(idx)]; }
+    T& operator()(const std::vector<std::size_t>& idx)
+    {
+        return mData[GetOffsetFromMultiIndex(idx)];
+    }

-    const T& operator()(std::vector<std::size_t> idx) const
+    const T& operator()(const std::vector<std::size_t>& idx) const
    {
        return mData[GetOffsetFromMultiIndex(idx)];
    }
@@ -719,6 +722,8 @@ struct HostTensor
                    file << type_convert<float>(itm) << std::endl;
                else if(dtype == "int")
                    file << type_convert<int>(itm) << std::endl;
+                else if(dtype == "int8_t")
+                    file << static_cast<int>(type_convert<ck_tile::int8_t>(itm)) << std::endl;
                else
                    // TODO: we didn't implement operator<< for all custom
                    // data types, here fall back to float in case compile error
--- a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp
+++ b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1.hpp
@@ -75,7 +75,6 @@ struct FlatmmPipelineAGmemBGmemCRegV1

    CK_TILE_HOST_DEVICE static constexpr auto HotLoopScheduler()
    {
-#if defined(USING_MFMA_16x16x32) || defined(USING_MFMA_32x32x16)
        constexpr auto config = BlockFlatmm::BlockPolicy::template GetWarpGemmMWarpNWarp<Problem>();

        using WG = remove_cvref_t<decltype(config.template at<0>())>;
@@ -91,64 +90,68 @@ struct FlatmmPipelineAGmemBGmemCRegV1
        constexpr index_t A_Buffer_Load_Inst_Num = kMPerBlock * kKPerBlock / BlockSize / KPerLoad;
        constexpr index_t A_LDS_Read_Inst_Num    = MIterPerWarp * KIterPerWarp;
        constexpr index_t B_Buffer_Load_Inst_Num = NIterPerWarp * KIterPerWarp;
-#endif
-#if defined(USING_MFMA_16x16x32)
-        static_for<0, A_Buffer_Load_Inst_Num, 1>{}([&](auto i) {
-            ignore = i;
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-        });
-        static_for<0, A_LDS_Read_Inst_Num - A_Buffer_Load_Inst_Num, 1>{}([&](auto i) {
-            ignore = i;
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            __builtin_amdgcn_sched_group_barrier(0x008, 3, 0); // MFMA
-        });
-        static_for<0, B_Buffer_Load_Inst_Num, 1>{}([&](auto i) {
-            ignore = i;
-            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-            __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
-        });
-        static_for<0, A_Buffer_Load_Inst_Num, 1>{}([&](auto i) {
-            ignore = i;
-            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
-            __builtin_amdgcn_sched_group_barrier(0x008, 4, 0); // MFMA
-        });

-#elif defined(USING_MFMA_32x32x16)
-        static_for<0,
-                   A_LDS_Read_Inst_Num / 2 - A_Buffer_Load_Inst_Num - B_Buffer_Load_Inst_Num,
-                   1>{}([&](auto i) {
-            ignore = i;
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-        });
-        static_for<0, A_Buffer_Load_Inst_Num, 1>{}([&](auto i) {
-            ignore = i;
-            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-        });
-        static_for<0, A_LDS_Read_Inst_Num / 2, 1>{}([&](auto i) {
-            ignore = i;
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-        });
-        static_for<0, B_Buffer_Load_Inst_Num, 1>{}([&](auto i) {
-            ignore = i;
-            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
-            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-        });
-        static_for<0, A_Buffer_Load_Inst_Num, 1>{}([&](auto i) {
-            ignore = i;
-            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
-            __builtin_amdgcn_sched_group_barrier(0x008, 3, 0); // MFMA
-        });
-        __builtin_amdgcn_sched_group_barrier(0x008, 4, 0); // MFMA
-#endif
+        if constexpr(WG::kM == 16 && WG::kN == 16)
+        {
+            static_for<0, A_Buffer_Load_Inst_Num, 1>{}([&](auto i) {
+                ignore = i;
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            });
+            static_for<0, A_LDS_Read_Inst_Num - A_Buffer_Load_Inst_Num, 1>{}([&](auto i) {
+                ignore = i;
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                __builtin_amdgcn_sched_group_barrier(0x008, 3, 0); // MFMA
+            });
+            static_for<0, B_Buffer_Load_Inst_Num, 1>{}([&](auto i) {
+                ignore = i;
+                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                __builtin_amdgcn_sched_group_barrier(0x008, 2, 0); // MFMA
+            });
+            static_for<0, A_Buffer_Load_Inst_Num, 1>{}([&](auto i) {
+                ignore = i;
+                __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
+                __builtin_amdgcn_sched_group_barrier(0x008, 4, 0); // MFMA
+            });
+        }
+        else if constexpr(WG::kM == 32 && WG::kN == 32 &&
+                          (A_LDS_Read_Inst_Num / 2 >
+                           A_Buffer_Load_Inst_Num + B_Buffer_Load_Inst_Num))
+        {
+            static_for<0,
+                       A_LDS_Read_Inst_Num / 2 - A_Buffer_Load_Inst_Num - B_Buffer_Load_Inst_Num,
+                       1>{}([&](auto i) {
+                ignore = i;
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            });
+            static_for<0, A_Buffer_Load_Inst_Num, 1>{}([&](auto i) {
+                ignore = i;
+                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            });
+            static_for<0, A_LDS_Read_Inst_Num / 2, 1>{}([&](auto i) {
+                ignore = i;
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            });
+            static_for<0, B_Buffer_Load_Inst_Num, 1>{}([&](auto i) {
+                ignore = i;
+                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            });
+            static_for<0, A_Buffer_Load_Inst_Num, 1>{}([&](auto i) {
+                ignore = i;
+                __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
+                __builtin_amdgcn_sched_group_barrier(0x008, 3, 0); // MFMA
+            });
+            __builtin_amdgcn_sched_group_barrier(0x008, 4, 0); // MFMA
+        }
    }

    template <typename ADramBlockWindowTmp, typename BFlatBlockWindowTmp, typename AElementFunction>
--- a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp
+++ b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp
@@ -19,55 +19,61 @@ struct UniversalFlatmmPipelineAgBgCrPolicy
    CK_TILE_HOST_DEVICE static constexpr auto MakeALdsBlockDescriptor()
    {
        using namespace ck_tile;
-#if defined(USING_MFMA_16x16x32)
-        /*reduce transform layers,compare with old ck*/
-        constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
-        constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
-        constexpr index_t KPack     = GetSmemPackA<Problem>();

-        constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor(
-            make_tuple(number<KPerBlock / KPack>{}, number<MPerBlock>{}, number<KPack>{}),
-            make_tuple(number<KPack>{}, number<KPerBlock>{}, number<1>{}),
-            number<KPack>{},
-            number<1>{});
+        constexpr index_t MPerXdl = Problem::BlockGemmShape::WarpTile::at(I0);
+        constexpr index_t NPerXdl = Problem::BlockGemmShape::WarpTile::at(I1);
+        if constexpr(MPerXdl == 16 && NPerXdl == 16)
+        {
+            /*reduce transform layers,compare with old ck*/
+            constexpr index_t MPerBlock = Problem::BlockGemmShape::kM;
+            constexpr index_t KPerBlock = Problem::BlockGemmShape::kK;
+            constexpr index_t KPack     = GetSmemPackA<Problem>();

-        constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor(
-            a_lds_block_desc_0,
-            make_tuple(
-                make_xor_transform(make_tuple(number<MPerBlock>{}, number<KPerBlock / KPack>{})),
-                make_pass_through_transform(number<KPack>{})),
-            make_tuple(sequence<1, 0>{}, sequence<2>{}),
-            make_tuple(sequence<1, 0>{}, sequence<2>{}));
+            constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor(
+                make_tuple(number<KPerBlock / KPack>{}, number<MPerBlock>{}, number<KPack>{}),
+                make_tuple(number<KPack>{}, number<KPerBlock>{}, number<1>{}),
+                number<KPack>{},
+                number<1>{});

-        constexpr auto a_lds_block_desc = transform_tensor_descriptor(
-            a_lds_block_desc_permuted,
-            make_tuple(make_pass_through_transform(number<MPerBlock>{}),
-                       make_merge_transform_v3_division_mod(
-                           make_tuple(number<KPerBlock / KPack>{}, number<KPack>{}))),
-            make_tuple(sequence<1>{}, sequence<0, 2>{}),
-            make_tuple(sequence<0>{}, sequence<1>{}));
+            constexpr auto a_lds_block_desc_permuted = transform_tensor_descriptor(
+                a_lds_block_desc_0,
+                make_tuple(make_xor_transform(
+                               make_tuple(number<MPerBlock>{}, number<KPerBlock / KPack>{})),
+                           make_pass_through_transform(number<KPack>{})),
+                make_tuple(sequence<1, 0>{}, sequence<2>{}),
+                make_tuple(sequence<1, 0>{}, sequence<2>{}));

-        return a_lds_block_desc;
-#elif defined(USING_MFMA_32x32x16)
-        constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
-        constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
-        constexpr index_t kKPack     = GetSmemPackA<Problem>();
+            constexpr auto a_lds_block_desc = transform_tensor_descriptor(
+                a_lds_block_desc_permuted,
+                make_tuple(make_pass_through_transform(number<MPerBlock>{}),
+                           make_merge_transform_v3_division_mod(
+                               make_tuple(number<KPerBlock / KPack>{}, number<KPack>{}))),
+                make_tuple(sequence<1>{}, sequence<0, 2>{}),
+                make_tuple(sequence<0>{}, sequence<1>{}));

-        constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor(
-            make_tuple(number<kKPerBlock / kKPack>{}, number<kMPerBlock>{}, number<kKPack>{}),
-            make_tuple(number<(kMPerBlock + 1) * kKPack>{}, number<kKPack>{}, number<1>{}),
-            number<kKPack>{},
-            number<1>{});
+            return a_lds_block_desc;
+        }
+        else
+        {
+            constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
+            constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
+            constexpr index_t kKPack     = GetSmemPackA<Problem>();

-        constexpr auto a_lds_block_desc = transform_tensor_descriptor(
-            a_lds_block_desc_0,
-            make_tuple(make_pass_through_transform(kMPerBlock),
-                       make_merge_transform(make_tuple(kKPerBlock / kKPack, kKPack))),
-            make_tuple(sequence<1>{}, sequence<0, 2>{}),
-            make_tuple(sequence<0>{}, sequence<1>{}));
+            constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor(
+                make_tuple(number<kKPerBlock / kKPack>{}, number<kMPerBlock>{}, number<kKPack>{}),
+                make_tuple(number<(kMPerBlock + 1) * kKPack>{}, number<kKPack>{}, number<1>{}),
+                number<kKPack>{},
+                number<1>{});

-        return a_lds_block_desc;
-#endif
+            constexpr auto a_lds_block_desc = transform_tensor_descriptor(
+                a_lds_block_desc_0,
+                make_tuple(make_pass_through_transform(kMPerBlock),
+                           make_merge_transform(make_tuple(kKPerBlock / kKPack, kKPack))),
+                make_tuple(sequence<1>{}, sequence<0, 2>{}),
+                make_tuple(sequence<0>{}, sequence<1>{}));
+
+            return a_lds_block_desc;
+        }
 /*xor*/
 #if 0
        constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
@@ -138,6 +144,21 @@ struct UniversalFlatmmPipelineAgBgCrPolicy
        return Problem::VectorLoadSize / sizeof(typename Problem::ADataType);
    }

+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetKBPerLoad()
+    {
+        using TileShape = typename Problem::BlockGemmShape;
+        if constexpr(TileShape::WarpTile::at(TileShape::idxN) == 32)
+        {
+            return TileShape::WarpTile::at(TileShape::idxK) / 2;
+        }
+        else
+        {
+            static_assert(TileShape::WarpTile::at(TileShape::idxN) == 16);
+            return TileShape::WarpTile::at(TileShape::idxK) / 4;
+        }
+    }
+
    template <typename Problem>
    CK_TILE_HOST_DEVICE static constexpr auto MakeADramTileDistribution()
    {
@@ -189,7 +210,7 @@ struct UniversalFlatmmPipelineAgBgCrPolicy
        }
        else
        {
-            constexpr index_t K1 = 16 / sizeof(ADataType);
+            constexpr index_t K1 = Problem::VectorLoadSize / sizeof(ADataType);
            constexpr index_t K0 = KPerBlock / K1;
            constexpr index_t M2 = get_warp_size() / K0;
            // coalesce reading for each blocks
@@ -232,19 +253,17 @@ struct UniversalFlatmmPipelineAgBgCrPolicy
    template <typename Problem>
    CK_TILE_HOST_DEVICE static constexpr auto MakeBFlatDramTileDistribution()
    {
-        using BDataType = remove_cvref_t<typename Problem::BDataType>;
-
        using TileShape = typename Problem::BlockGemmShape; // ck_tile::TileFlatmmShape

        constexpr index_t BlockSize = Problem::kBlockSize;
        constexpr index_t WaveSize  = get_warp_size();
        constexpr index_t WaveNum   = BlockSize / WaveSize;

-        constexpr index_t KBPerLoad =
-            Problem::VectorLoadSize / sizeof(BDataType); // dwordx4 load B elem cnt
-        constexpr index_t KThdPerWave = WaveSize;        // threads cnt in K dim
+        constexpr index_t KBPerLoad   = GetKBPerLoad<Problem>();
+        constexpr index_t KThdPerWave = WaveSize; // threads cnt in K dim
        constexpr index_t KWavePerBlk = 1;
        constexpr index_t KRepeat     = 1;
+        static_assert(TileShape::flatKPerWarp == KThdPerWave * KBPerLoad, "wrong");

        constexpr index_t NBPerLoad   = 1;
        constexpr index_t NThdPerWave = 1;
--- a/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_batch_prefill_kernel.hpp
@@ -316,56 +316,56 @@ struct FmhaBatchPrefillWithPagedKVCacheKernel

    template <bool Cond = !kIsGroupMode>
    CK_TILE_HOST static constexpr std::enable_if_t<Cond, Kargs>
-    MakeKargsImpl(const void* q_ptr,
-                  const void* k_ptr,
-                  const void* v_ptr,
-                  const void* bias_ptr,
-                  void* rand_val_ptr,
-                  void* lse_ptr,
-                  void* o_ptr,
-                  ck_tile::index_t seqlen_q,
-                  ck_tile::index_t hdim_q,
-                  ck_tile::index_t hdim_v,
-                  ck_tile::index_t num_head_q,
-                  ck_tile::index_t nhead_ratio_qk,
-                  int32_t num_total_pages,
-                  const void* kv_indptr,
-                  const void* kv_page_indices,
+    MakeKargs(const void* q_ptr,
+              const void* k_ptr,
+              const void* v_ptr,
+              const void* bias_ptr,
+              void* rand_val_ptr,
+              void* lse_ptr,
+              void* o_ptr,
+              ck_tile::index_t seqlen_q,
+              ck_tile::index_t hdim_q,
+              ck_tile::index_t hdim_v,
+              ck_tile::index_t num_head_q,
+              ck_tile::index_t nhead_ratio_qk,
+              int32_t num_total_pages,
+              const void* kv_indptr,
+              const void* kv_page_indices,
 #if 0 // we assume page_block_size=1 for now
              const void* kv_last_page_lens,
              ck_tile::index_t page_block_size,
 #endif
-                  float scale_s,
-                  float scale_p,
-                  float scale_o,
-                  float logits_soft_cap,
-                  ck_tile::index_t stride_q,
-                  ck_tile::index_t stride_k,
-                  ck_tile::index_t stride_v,
-                  ck_tile::index_t stride_bias,
-                  ck_tile::index_t stride_randval,
-                  ck_tile::index_t stride_o,
-                  ck_tile::index_t nhead_stride_q,
-                  ck_tile::index_t nhead_stride_k,
-                  ck_tile::index_t nhead_stride_v,
-                  ck_tile::index_t nhead_stride_bias,
-                  ck_tile::index_t nhead_stride_randval,
-                  ck_tile::index_t nhead_stride_lse,
-                  ck_tile::index_t nhead_stride_o,
-                  ck_tile::index_t batch_stride_q,
-                  ck_tile::index_t batch_stride_k,
-                  ck_tile::index_t batch_stride_v,
-                  ck_tile::index_t batch_stride_bias,
-                  ck_tile::index_t batch_stride_randval,
-                  ck_tile::index_t batch_stride_lse,
-                  ck_tile::index_t batch_stride_o,
-                  ck_tile::index_t window_size_left,
-                  ck_tile::index_t window_size_right,
-                  ck_tile::index_t mask_type,
-                  float p_drop,
-                  bool s_randval,
-                  std::variant<std::pair<uint64_t, uint64_t>, std::pair<const void*, const void*>>
-                      drop_seed_offset)
+              float scale_s,
+              float scale_p,
+              float scale_o,
+              float logits_soft_cap,
+              ck_tile::index_t stride_q,
+              ck_tile::index_t stride_k,
+              ck_tile::index_t stride_v,
+              ck_tile::index_t stride_bias,
+              ck_tile::index_t stride_randval,
+              ck_tile::index_t stride_o,
+              ck_tile::index_t nhead_stride_q,
+              ck_tile::index_t nhead_stride_k,
+              ck_tile::index_t nhead_stride_v,
+              ck_tile::index_t nhead_stride_bias,
+              ck_tile::index_t nhead_stride_randval,
+              ck_tile::index_t nhead_stride_lse,
+              ck_tile::index_t nhead_stride_o,
+              ck_tile::index_t batch_stride_q,
+              ck_tile::index_t batch_stride_k,
+              ck_tile::index_t batch_stride_v,
+              ck_tile::index_t batch_stride_bias,
+              ck_tile::index_t batch_stride_randval,
+              ck_tile::index_t batch_stride_lse,
+              ck_tile::index_t batch_stride_o,
+              ck_tile::index_t window_size_left,
+              ck_tile::index_t window_size_right,
+              ck_tile::index_t mask_type,
+              float p_drop,
+              bool s_randval,
+              std::variant<std::pair<uint64_t, uint64_t>, std::pair<const void*, const void*>>
+                  drop_seed_offset)
    {
        Kargs kargs{{q_ptr,
                     k_ptr,
@@ -468,51 +468,51 @@ struct FmhaBatchPrefillWithPagedKVCacheKernel

    template <bool Cond = kIsGroupMode>
    CK_TILE_HOST static constexpr std::enable_if_t<Cond, Kargs>
-    MakeKargsImpl(const void* q_ptr,
-                  const void* k_ptr,
-                  const void* v_ptr,
-                  const void* bias_ptr,
-                  void* rand_val_ptr,
-                  void* lse_ptr,
-                  void* o_ptr,
-                  const void* seqstart_q_ptr,
-                  ck_tile::index_t hdim_q,
-                  ck_tile::index_t hdim_v,
-                  ck_tile::index_t num_head_q,
-                  ck_tile::index_t nhead_ratio_qk,
-                  int32_t num_total_pages,
-                  const void* kv_indptr,
-                  const void* kv_page_indices,
+    MakeKargs(const void* q_ptr,
+              const void* k_ptr,
+              const void* v_ptr,
+              const void* bias_ptr,
+              void* rand_val_ptr,
+              void* lse_ptr,
+              void* o_ptr,
+              const void* seqstart_q_ptr,
+              ck_tile::index_t hdim_q,
+              ck_tile::index_t hdim_v,
+              ck_tile::index_t num_head_q,
+              ck_tile::index_t nhead_ratio_qk,
+              int32_t num_total_pages,
+              const void* kv_indptr,
+              const void* kv_page_indices,
 #if 0 // we assume page_block_size=1 for now
              const void* kv_last_page_lens,
              ck_tile::index_t page_block_size,
 #endif
-                  float scale_s,
-                  float scale_p,
-                  float scale_o,
-                  float logits_soft_cap,
-                  ck_tile::index_t stride_q,
-                  ck_tile::index_t stride_k,
-                  ck_tile::index_t stride_v,
-                  ck_tile::index_t stride_bias,
-                  ck_tile::index_t stride_randval,
-                  ck_tile::index_t stride_o,
-                  ck_tile::index_t nhead_stride_q,
-                  ck_tile::index_t nhead_stride_k,
-                  ck_tile::index_t nhead_stride_v,
-                  ck_tile::index_t nhead_stride_bias,
-                  ck_tile::index_t nhead_stride_randval,
-                  ck_tile::index_t nhead_stride_lse,
-                  ck_tile::index_t nhead_stride_o,
-                  ck_tile::index_t batch_stride_k,
-                  ck_tile::index_t batch_stride_v,
-                  ck_tile::index_t window_size_left,
-                  ck_tile::index_t window_size_right,
-                  ck_tile::index_t mask_type,
-                  float p_drop,
-                  bool s_randval,
-                  std::variant<std::pair<uint64_t, uint64_t>, std::pair<const void*, const void*>>
-                      drop_seed_offset)
+              float scale_s,
+              float scale_p,
+              float scale_o,
+              float logits_soft_cap,
+              ck_tile::index_t stride_q,
+              ck_tile::index_t stride_k,
+              ck_tile::index_t stride_v,
+              ck_tile::index_t stride_bias,
+              ck_tile::index_t stride_randval,
+              ck_tile::index_t stride_o,
+              ck_tile::index_t nhead_stride_q,
+              ck_tile::index_t nhead_stride_k,
+              ck_tile::index_t nhead_stride_v,
+              ck_tile::index_t nhead_stride_bias,
+              ck_tile::index_t nhead_stride_randval,
+              ck_tile::index_t nhead_stride_lse,
+              ck_tile::index_t nhead_stride_o,
+              ck_tile::index_t batch_stride_k,
+              ck_tile::index_t batch_stride_v,
+              ck_tile::index_t window_size_left,
+              ck_tile::index_t window_size_right,
+              ck_tile::index_t mask_type,
+              float p_drop,
+              bool s_randval,
+              std::variant<std::pair<uint64_t, uint64_t>, std::pair<const void*, const void*>>
+                  drop_seed_offset)
    {
        Kargs kargs{{q_ptr,
                     k_ptr,
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_kernel.hpp
@@ -808,6 +808,7 @@ struct FmhaFwdKernel
              ck_tile::index_t window_size_left,
              ck_tile::index_t window_size_right,
              ck_tile::index_t mask_type,
+              ck_tile::index_t min_seqlen_q,
              float p_drop,
              bool s_randval,
              const std::tuple<uint64_t, uint64_t>& drop_seed_offset)
@@ -847,7 +848,7 @@ struct FmhaFwdKernel
            window_size_left,
            window_size_right,
            mask_type,
-            0, // min_seqlen_q
+            min_seqlen_q,
            p_drop,
            s_randval,
            std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset)));
@@ -890,6 +891,7 @@ struct FmhaFwdKernel
              ck_tile::index_t window_size_left,
              ck_tile::index_t window_size_right,
              ck_tile::index_t mask_type,
+              ck_tile::index_t min_seqlen_q,
              float p_drop,
              bool s_randval,
              const std::tuple<const void*, const void*>& drop_seed_offset)
@@ -929,6 +931,7 @@ struct FmhaFwdKernel
            window_size_left,
            window_size_right,
            mask_type,
+            min_seqlen_q,
            p_drop,
            s_randval,
            std::make_pair(std::get<0>(drop_seed_offset), std::get<1>(drop_seed_offset)));
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -787,12 +787,29 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
            constexpr index_t N0 = kNPerBlock / N1; // P

            constexpr index_t total_pixels = kNPerBlock * kKPerBlock / kBlockSize;
-            static_assert(total_pixels % N1 == 0); // TODO: this is not always true?
-            constexpr index_t K3     = total_pixels / N1;
-            constexpr index_t kKPack = GetSmemKPackV<Problem>();
-            static_assert(kKPack % K3 == 0);
+            constexpr index_t kKPack       = GetSmemKPackV<Problem>();
+            constexpr index_t K3           = total_pixels / N1;
            constexpr index_t K2 = kKPack / K3; // TODO: this dimention could be outside single wave
-            if constexpr(get_warp_size() % (K2 * N0) == 0)
+            if constexpr(total_pixels % N1 != 0 || kKPack % K3 != 0) // if K2 or K3 is not divisible
+            {
+                constexpr index_t kNPack = 32;
+                static_assert(kNPerBlock % kNPack == 0);
+                constexpr index_t K0   = kBlockSize / get_warp_size();
+                constexpr index_t N2   = 2;
+                constexpr index_t N1_m = kNPack / N2;
+                constexpr index_t N0_m = kNPerBlock / kNPack;
+                constexpr index_t K1   = get_warp_size() / N1_m;
+                constexpr index_t K2_m = kKPerBlock / K1;
+                return make_static_tile_distribution(
+                    tile_distribution_encoding<
+                        sequence<1>,
+                        tuple<sequence<N0_m, N1_m, N2>, sequence<K0, K1, K2_m>>,
+                        tuple<sequence<2>, sequence<2, 1>>, // K0, K1 N0
+                        tuple<sequence<0>, sequence<1, 1>>,
+                        sequence<1, 2, 1>, // N0 K2 N2
+                        sequence<0, 2, 2>>{});
+            }
+            else if constexpr(get_warp_size() % (kKPack / K3 * N0) == 0)
            {
                constexpr index_t K1 = get_warp_size() / (K2 * N0);
                constexpr index_t K0 = kBlockSize / get_warp_size();
@@ -860,12 +877,28 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
        constexpr index_t N1           = GetAlignmentV<Problem>();
        constexpr index_t N0           = kNPerBlock / N1;
        constexpr index_t total_pixels = kNPerBlock * kKPerBlock / kBlockSize;
-        static_assert(total_pixels % N1 == 0); // TODO: this is not always true?
-        constexpr index_t K3     = total_pixels / N1;
-        constexpr index_t kKPack = GetSmemKPackV<Problem>();
-        static_assert(kKPack % K3 == 0);
+        constexpr index_t K3           = total_pixels / N1;
+        constexpr index_t kKPack       = GetSmemKPackV<Problem>();
        constexpr index_t K2 = kKPack / K3; // TODO: this dimention could be outside single wave
-        if constexpr(get_warp_size() % (K2 * N0) == 0)
+        if constexpr(total_pixels % N1 != 0 || kKPack % K3 != 0) // if K2 or K3 is not divisible
+        {
+            constexpr index_t kNPack = 32;
+            static_assert(kNPerBlock % kNPack == 0);
+            constexpr index_t K0   = kBlockSize / get_warp_size();
+            constexpr index_t N2   = 2;
+            constexpr index_t N1_m = kNPack / N2;
+            constexpr index_t N0_m = kNPerBlock / kNPack;
+            constexpr index_t K1   = get_warp_size() / N1_m;
+            constexpr index_t K2_m = kKPerBlock / K1;
+            return make_static_tile_distribution(
+                tile_distribution_encoding<sequence<1>,
+                                           tuple<sequence<N0_m, N1_m, N2>, sequence<K0, K1, K2_m>>,
+                                           tuple<sequence<2>, sequence<2, 1>>, // K0, K1 N0
+                                           tuple<sequence<0>, sequence<1, 1>>,
+                                           sequence<1, 1, 2>, // N0 K2 <-> N2
+                                           sequence<0, 2, 2>>{});
+        }
+        else if constexpr(get_warp_size() % (kKPack / K3 * N0) == 0)
        {
            constexpr index_t K1 = get_warp_size() / (K2 * N0);
            constexpr index_t K0 = kBlockSize / get_warp_size();
--- a/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_shape.hpp
+++ b/include/ck_tile/ops/fused_moe/kernel/fused_moegemm_shape.hpp
@@ -101,7 +101,7 @@ struct FusedMoeGemmShape
    static constexpr index_t Repeat_N1 = Block_N1 / ThreadPerBlock_N1;
    static constexpr index_t Repeat_K1 = Block_K1 / ThreadPerBlock_K1;

-    static constexpr index_t BlockSize = WarpSize * NumWarps;
+    static constexpr index_t BlockSize = get_warp_size() * NumWarps;

    // some assert
    static_assert(Block_M0 == Block_M1);
--- a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
+++ b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
@@ -388,7 +388,7 @@ struct MoeSortingKernel
    }

    // reduce single pixel within a wave
-    template <typename T, typename F, index_t wave_size_ = WarpSize>
+    template <typename T, typename F, index_t wave_size_ = get_warp_size()>
    __device__ static constexpr T wave_reduce(T local, F reduce_f, number<wave_size_> = {})
    {
        // constexpr int wave_size = 64;
@@ -625,7 +625,7 @@ struct MoeSortingKernel
        {
            const index_t prefill_token = topk_mdiv.div(numel);
            // TODO: only support expert-tile like 8, 16, 32
-            static constexpr index_t experts_per_wave = WarpSize / Problem::ExpertTile;
+            static constexpr index_t experts_per_wave = get_warp_size() / Problem::ExpertTile;
            {
                index_t eid           = tid / experts_per_wave;
                index_t expert_offset = cumsum[eid] +
@@ -693,7 +693,7 @@ struct MoeSortingKernel
                                   void* smem) const
    {
        const index_t tid            = static_cast<index_t>(threadIdx.x);
-        const index_t wid            = __builtin_amdgcn_readfirstlane(tid / WarpSize);
+        const index_t wid            = __builtin_amdgcn_readfirstlane(tid / get_warp_size());
        const index_t lid            = __lane_id();
        constexpr index_t block_size = 256;           // blockDim.x;
        const index_t sub_tokens     = smem_rows - 2; // sub_tokens_mdiv.divisor;
@@ -798,7 +798,7 @@ struct MoeSortingKernel
                // NOTE: under this block can never use __syncthreads!
                int i_e_          = 0;
                int local_cumsum_ = 0;
-                for(; i_e_ < num_experts; i_e_ += WarpSize)
+                for(; i_e_ < num_experts; i_e_ += get_warp_size())
                {
                    int pre_cumsum_ = smem_cumsum(lid == 0 ? i_e_ : 0);
                    int local_cnt   = smem_cumsum(i_e_ + lid + 1);
@@ -843,7 +843,7 @@ struct MoeSortingKernel
                                                  // cumsum padded in case local cumsum is zero, but
                                                  // pre_sumsum has value, which will result int
                                                  // zero local cumsum(but we want at least padded)
-                    wave_cumsum<int, WarpSize>(local_cumsum_);
+                    wave_cumsum<int, get_warp_size()>(local_cumsum_);

                    if((i_e_ + lid) < num_experts)
                        smem_cumsum(i_e_ + lid + 1) = local_cumsum_;
@@ -851,7 +851,7 @@ struct MoeSortingKernel
                    if constexpr(Problem::LocalExpertMasking)
                    {
                        local_masking += pre_cumsum_masking;
-                        wave_cumsum<int, WarpSize>(local_masking);
+                        wave_cumsum<int, get_warp_size()>(local_masking);
                        if((i_e_ + lid) < num_experts)
                            smem_cumdup(i_e_ + lid + 1) = local_masking;
                    }
@@ -861,7 +861,7 @@ struct MoeSortingKernel
                    // than 0(which is not we want)
                    __builtin_amdgcn_s_waitcnt(0xc07f);
                }
-                if((lid + i_e_ - WarpSize) == (num_experts - 1))
+                if((lid + i_e_ - get_warp_size()) == (num_experts - 1))
                {
                    *p_total_tokens_post_pad = local_cumsum_;
                }
@@ -1109,7 +1109,7 @@ CK_TILE_HOST_DEVICE index_t moe_sorting_mp_sem_smem_size()
    return chunk * sizeof(index_t);
 };

-template <typename T, typename F, index_t wave_size_ = WarpSize>
+template <typename T, typename F, index_t wave_size_ = get_warp_size()>
 CK_TILE_DEVICE constexpr T moe_sorting_wave_reduce(T local, F reduce_f, number<wave_size_> = {})
 {
    // constexpr int wave_size = 64;
@@ -1504,7 +1504,7 @@ struct MoeSortingMultiPhaseKernel_P1
    // in byte
    CK_TILE_HOST_DEVICE static constexpr auto GetSmemSize()
    {
-        return BLOCK_SIZE / WarpSize * sizeof(IndexType);
+        return BLOCK_SIZE / get_warp_size() * sizeof(IndexType);
    }

    CK_TILE_DEVICE void operator()(Kargs kargs) const
@@ -1546,8 +1546,8 @@ struct MoeSortingMultiPhaseKernel_P1
            cnt += impl::moe_sorting_wave_reduce(local_sum, f_sum);
        }

-        index_t lane_id = threadIdx.x % WarpSize;
-        index_t wave_id = threadIdx.x / WarpSize;
+        index_t lane_id = threadIdx.x % get_warp_size();
+        index_t wave_id = threadIdx.x / get_warp_size();

        // reduce cross wave
        IndexType* s = reinterpret_cast<IndexType*>(smem);
@@ -1560,7 +1560,7 @@ struct MoeSortingMultiPhaseKernel_P1
        if(threadIdx.x == 0)
        {
            index_t c = 0;
-            for(auto i = 0; i < (BLOCK_SIZE / WarpSize); i++)
+            for(auto i = 0; i < (BLOCK_SIZE / get_warp_size()); i++)
            {
                c += s[i];
            }
@@ -1660,7 +1660,7 @@ struct MoeSortingMultiPhaseKernel_P01
    // in byte
    CK_TILE_HOST static constexpr auto GetSmemSize()
    {
-        return BLOCK_SIZE / WarpSize * sizeof(IndexType);
+        return BLOCK_SIZE / get_warp_size() * sizeof(IndexType);
    }

    CK_TILE_DEVICE void operator()(Kargs kargs) const
@@ -1786,8 +1786,8 @@ struct MoeSortingMultiPhaseKernel_P01
                    cnt += impl::moe_sorting_wave_reduce(local_sum, f_sum);
                }

-                index_t lane_id = threadIdx.x % WarpSize;
-                index_t wave_id = threadIdx.x / WarpSize;
+                index_t lane_id = threadIdx.x % get_warp_size();
+                index_t wave_id = threadIdx.x / get_warp_size();

                // reduce cross wave
                IndexType* s = reinterpret_cast<IndexType*>(smem);
@@ -1801,7 +1801,7 @@ struct MoeSortingMultiPhaseKernel_P01
                if(threadIdx.x == 0)
                {
                    index_t c = 0;
-                    for(auto i = 0; i < (BLOCK_SIZE / WarpSize); i++)
+                    for(auto i = 0; i < (BLOCK_SIZE / get_warp_size()); i++)
                    {
                        c += s[i];
                    }
@@ -1880,7 +1880,7 @@ struct MoeSortingMultiPhaseKernel_P2
    CK_TILE_HOST_DEVICE static constexpr auto GetSmemSize()
    {
        // return 2 * BLOCK_SIZE * sizeof(IndexType);
-        return (4 + 2 * BLOCK_SIZE / WarpSize) * sizeof(IndexType);
+        return (4 + 2 * BLOCK_SIZE / get_warp_size()) * sizeof(IndexType);
    }

    // reduce single pixel within a wave
@@ -1905,8 +1905,8 @@ struct MoeSortingMultiPhaseKernel_P2
        IndexType* p_sorted_expert_ids = reinterpret_cast<IndexType*>(kargs.p_sorted_expert_ids);

        const index_t loops = (kargs.num_experts + BLOCK_SIZE - 1) / BLOCK_SIZE;
-        index_t wave_id     = threadIdx.x / WarpSize;
-        index_t lane_id     = threadIdx.x % WarpSize;
+        index_t wave_id     = threadIdx.x / get_warp_size();
+        index_t lane_id     = threadIdx.x % get_warp_size();

        IndexType prev_cumsum_a = 0;
        IndexType prev_cumsum_b = 0;
@@ -1951,22 +1951,22 @@ struct MoeSortingMultiPhaseKernel_P2
            IndexType cumsum_b = b_;

            // Note: we first cumsum local round, then add previous cumsum
-            impl::moe_sorting_wave_cumsum<IndexType, WarpSize>(cumsum_a);
-            impl::moe_sorting_wave_cumsum<IndexType, WarpSize>(cumsum_b);
+            impl::moe_sorting_wave_cumsum<IndexType, get_warp_size()>(cumsum_a);
+            impl::moe_sorting_wave_cumsum<IndexType, get_warp_size()>(cumsum_b);

            __syncthreads();
-            if(lane_id == WarpSize - 1)
+            if(lane_id == get_warp_size() - 1)
            {
-                s[4 + wave_id]                         = cumsum_a;
-                s[4 + wave_id + BLOCK_SIZE / WarpSize] = cumsum_b;
+                s[4 + wave_id]                                = cumsum_a;
+                s[4 + wave_id + BLOCK_SIZE / get_warp_size()] = cumsum_b;
            }

            __syncthreads();

            // reduce cross wave
-            static_for<0, BLOCK_SIZE / WarpSize - 1, 1>{}([&](auto i_w) {
+            static_for<0, BLOCK_SIZE / get_warp_size() - 1, 1>{}([&](auto i_w) {
                IndexType prev_a = s[4 + i_w];
-                IndexType prev_b = s[4 + i_w + BLOCK_SIZE / WarpSize];
+                IndexType prev_b = s[4 + i_w + BLOCK_SIZE / get_warp_size()];
                prev_a           = wave_id > i_w ? prev_a : 0; // mask out
                prev_b           = wave_id > i_w ? prev_b : 0; // mask out
                cumsum_a += prev_a;
@@ -2083,7 +2083,7 @@ struct MoeSortingMultiPhaseKernel_P3
    // in byte
    CK_TILE_HOST_DEVICE static constexpr auto GetSmemSize()
    {
-        return (4 + BLOCK_SIZE / WarpSize) * sizeof(IndexType);
+        return (4 + BLOCK_SIZE / get_warp_size()) * sizeof(IndexType);
    }

    CK_TILE_DEVICE void operator()(Kargs kargs) const
@@ -2110,8 +2110,8 @@ struct MoeSortingMultiPhaseKernel_P3
            }
        }();
        int eid     = blockIdx.x;
-        int wave_id = threadIdx.x / WarpSize;
-        int lane_id = threadIdx.x % WarpSize;
+        int wave_id = threadIdx.x / get_warp_size();
+        int lane_id = threadIdx.x % get_warp_size();
        int e_start = p_expert_cumsum[eid];
        int e_end   = p_expert_cumsum[eid + 1];
        if constexpr(Problem::SkipExpertsWithZeroTokens)
@@ -2141,17 +2141,17 @@ struct MoeSortingMultiPhaseKernel_P3
            int i_topk = x - 1;          // topk of this token
            int i_show = x != 0 ? 1 : 0; // has this token or not
            int cumsum = i_show;
-            impl::moe_sorting_wave_cumsum<int, WarpSize>(cumsum);
+            impl::moe_sorting_wave_cumsum<int, get_warp_size()>(cumsum);

            __syncthreads();
-            if(lane_id == WarpSize - 1)
+            if(lane_id == get_warp_size() - 1)
            {
                s[4 + wave_id] = cumsum;
            }
            __syncthreads();

            // reduce cross wave
-            static_for<0, BLOCK_SIZE / WarpSize - 1, 1>{}([&](auto i_w) {
+            static_for<0, BLOCK_SIZE / get_warp_size() - 1, 1>{}([&](auto i_w) {
                IndexType prev = s[4 + i_w];
                prev           = wave_id > i_w ? prev : 0; // mask out
                cumsum += prev;
@@ -2196,7 +2196,7 @@ CK_TILE_HOST constexpr auto moe_sorting_get_smem_size_p23(int num_experts_)
 {
    constexpr index_t BLOCK_SIZE     = 256; // hardcoded 256
    const index_t expert_cumsum_elem = num_experts_ + 1;
-    return (4 + 2 * BLOCK_SIZE / WarpSize + expert_cumsum_elem) * sizeof(int);
+    return (4 + 2 * BLOCK_SIZE / get_warp_size() + expert_cumsum_elem) * sizeof(int);
 }
 } // namespace impl

@@ -2303,15 +2303,15 @@ struct MoeSortingMultiPhaseKernel_P23
            const IndexType* p_local_expert_mask =
                static_cast<const IndexType*>(kargs.p_local_expert_mask);
            IndexType* p_expert_cumsum      = reinterpret_cast<IndexType*>(kargs.p_expert_cumsum);
-            IndexType* p_expert_cumsum_smem = s + 4 + 2 * BLOCK_SIZE / WarpSize;
+            IndexType* p_expert_cumsum_smem = s + 4 + 2 * BLOCK_SIZE / get_warp_size();
            IndexType* p_total_tokens_post_pad =
                reinterpret_cast<IndexType*>(kargs.p_total_tokens_post_pad);
            IndexType* p_sorted_expert_ids =
                reinterpret_cast<IndexType*>(kargs.p_sorted_expert_ids);

            const index_t loops = (kargs.num_experts + BLOCK_SIZE - 1) / BLOCK_SIZE;
-            index_t wave_id     = threadIdx.x / WarpSize;
-            index_t lane_id     = threadIdx.x % WarpSize;
+            index_t wave_id     = threadIdx.x / get_warp_size();
+            index_t lane_id     = threadIdx.x % get_warp_size();

            IndexType prev_cumsum_a = 0;
            IndexType prev_cumsum_b = 0;
@@ -2356,22 +2356,22 @@ struct MoeSortingMultiPhaseKernel_P23
                IndexType cumsum_b = b_;

                // Note: we first cumsum local round, then add previous cumsum
-                impl::moe_sorting_wave_cumsum<IndexType, WarpSize>(cumsum_a);
-                impl::moe_sorting_wave_cumsum<IndexType, WarpSize>(cumsum_b);
+                impl::moe_sorting_wave_cumsum<IndexType, get_warp_size()>(cumsum_a);
+                impl::moe_sorting_wave_cumsum<IndexType, get_warp_size()>(cumsum_b);

                __syncthreads();
-                if(lane_id == WarpSize - 1)
+                if(lane_id == get_warp_size() - 1)
                {
-                    s[4 + wave_id]                         = cumsum_a;
-                    s[4 + wave_id + BLOCK_SIZE / WarpSize] = cumsum_b;
+                    s[4 + wave_id]                                = cumsum_a;
+                    s[4 + wave_id + BLOCK_SIZE / get_warp_size()] = cumsum_b;
                }

                __syncthreads();

                // reduce cross wave
-                static_for<0, BLOCK_SIZE / WarpSize - 1, 1>{}([&](auto i_w) {
+                static_for<0, BLOCK_SIZE / get_warp_size() - 1, 1>{}([&](auto i_w) {
                    IndexType prev_a = s[4 + i_w];
-                    IndexType prev_b = s[4 + i_w + BLOCK_SIZE / WarpSize];
+                    IndexType prev_b = s[4 + i_w + BLOCK_SIZE / get_warp_size()];
                    prev_a           = wave_id > i_w ? prev_a : 0; // mask out
                    prev_b           = wave_id > i_w ? prev_b : 0; // mask out
                    cumsum_a += prev_a;
@@ -2441,13 +2441,13 @@ struct MoeSortingMultiPhaseKernel_P23
            IndexType* s                  = reinterpret_cast<IndexType*>(smem);
            MeshType* p_expert_mesh       = reinterpret_cast<MeshType*>(kargs.p_expert_mesh);
            IndexType* p_sorted_token_ids = reinterpret_cast<IndexType*>(kargs.p_sorted_token_ids);
-            IndexType* p_expert_cumsum_smem = s + 4 + 2 * BLOCK_SIZE / WarpSize;
+            IndexType* p_expert_cumsum_smem = s + 4 + 2 * BLOCK_SIZE / get_warp_size();
            const WeightType* p_weights     = static_cast<const WeightType*>(kargs.p_weights);
            WeightType* p_sorted_weights    = reinterpret_cast<WeightType*>(kargs.p_sorted_weights);

            int eid     = blockIdx.x;
-            int wave_id = threadIdx.x / WarpSize;
-            int lane_id = threadIdx.x % WarpSize;
+            int wave_id = threadIdx.x / get_warp_size();
+            int lane_id = threadIdx.x % get_warp_size();
            int e_start = p_expert_cumsum_smem[eid];
            int e_end   = p_expert_cumsum_smem[eid + 1];
            if constexpr(Problem::SkipExpertsWithZeroTokens)
@@ -2518,17 +2518,17 @@ struct MoeSortingMultiPhaseKernel_P23
                        int i_topk  = x - 1;          // topk of this token
                        int i_show  = x != 0 ? 1 : 0; // has this token or not
                        int cumsum  = i_show;
-                        impl::moe_sorting_wave_cumsum<int, WarpSize>(cumsum);
+                        impl::moe_sorting_wave_cumsum<int, get_warp_size()>(cumsum);

                        __syncthreads();
-                        if(lane_id == WarpSize - 1)
+                        if(lane_id == get_warp_size() - 1)
                        {
                            s[4 + wave_id] = cumsum;
                        }
                        __syncthreads();

                        // reduce cross wave
-                        static_for<0, BLOCK_SIZE / WarpSize - 1, 1>{}([&](auto i_w) {
+                        static_for<0, BLOCK_SIZE / get_warp_size() - 1, 1>{}([&](auto i_w) {
                            IndexType prev = s[4 + i_w];
                            prev           = wave_id > i_w ? prev : 0; // mask out
                            cumsum += prev;
@@ -2569,17 +2569,17 @@ struct MoeSortingMultiPhaseKernel_P23
                            cumsum_store += i_show[j];
                        });
                        int cumsum = cumsum_store;
-                        impl::moe_sorting_wave_cumsum<int, WarpSize>(cumsum);
+                        impl::moe_sorting_wave_cumsum<int, get_warp_size()>(cumsum);

                        __syncthreads();
-                        if(lane_id == WarpSize - 1)
+                        if(lane_id == get_warp_size() - 1)
                        {
                            s[4 + wave_id] = cumsum;
                        }
                        __syncthreads();

                        // reduce cross wave
-                        static_for<0, BLOCK_SIZE / WarpSize - 1, 1>{}([&](auto i_w) {
+                        static_for<0, BLOCK_SIZE / get_warp_size() - 1, 1>{}([&](auto i_w) {
                            IndexType prev = s[4 + i_w];
                            prev           = wave_id > i_w ? prev : 0; // mask out
                            cumsum += prev;
@@ -2624,17 +2624,17 @@ struct MoeSortingMultiPhaseKernel_P23
                        int i_topk_1  = x1 - 1;          // topk of this token
                        int i_show_1  = x1 != 0 ? 1 : 0; // has this token or not
                        int cumsum  = i_show_0 + i_show_1;
-                        impl::moe_sorting_wave_cumsum<int, WarpSize>(cumsum);
+                        impl::moe_sorting_wave_cumsum<int, get_warp_size()>(cumsum);

                        __syncthreads();
-                        if(lane_id == WarpSize - 1)
+                        if(lane_id == get_warp_size() - 1)
                        {
                            s[4 + wave_id] = cumsum;
                        }
                        __syncthreads();

                        // reduce cross wave
-                        static_for<0, BLOCK_SIZE / WarpSize - 1, 1>{}([&](auto i_w) {
+                        static_for<0, BLOCK_SIZE / get_warp_size() - 1, 1>{}([&](auto i_w) {
                            IndexType prev = s[4 + i_w];
                            prev           = wave_id > i_w ? prev : 0; // mask out
                            cumsum += prev;
--- a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp
+++ b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp
@@ -215,7 +215,7 @@ struct BlockUniversalGemmAsBsCr
        using BLdsTile = decltype(make_static_distributed_tensor<ComputeDataType>(BLdsTileDistr));

        ALdsTile a_warp_tile_;
-        ALdsTile b_warp_tile_;
+        BLdsTile b_warp_tile_;

        // C += A * B
        template <typename CBlockTensor, typename ASmemBlockWindow, typename BSmemBlockWindow>
--- a/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/gemm_kernel.hpp
@@ -59,14 +59,23 @@ struct GemmHostArgs
    const void* a_ptr;
    const void* b_ptr;
    const std::array<const void*, NumDTensor> ds_ptr;
-    void* e_ptr;
+    union
+    {
+        void* e_ptr;
+        void* c_ptr;
+    };
    index_t M;
    index_t N;
    index_t K;
    index_t stride_A;
    index_t stride_B;
    const std::array<index_t, NumDTensor> stride_Ds;
-    index_t stride_E;
+    union
+    {
+        index_t stride_E;
+        index_t stride_C;
+    };
+
    index_t k_batch;
 };

--- a/include/ck_tile/ops/gemm/warp/warp_gemm.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm.hpp
@@ -172,7 +172,7 @@ using WarpGemmMfmaBf16Bf16F32M32N32K16TransposedCDistribution =
 #if defined(__gfx950__)
 using WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution =
    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
-        WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K16<WGAttrCtlEnum::Default_>>>;
+        WarpGemmAttributeMfmaImplBf16Bf16F32M16N16K32<WGAttrCtlEnum::Default_>>>;
 #else
 using WarpGemmMfmaBf16Bf16F32M16N16K32TransposedCDistribution =
    WarpGemmImpl<WarpGemmAtrributeMfmaIterateKAndTransposedCDistribution<
@@ -282,4 +282,19 @@ using WarpGemmMfmaFp8Fp8F32M32N32K16SwizzleBTransposedCDistribution =
        2,
        swizzle_factor>>;

+// int8
+using WarpGemmMfma_i32_32x32x16_i8_i8 = WarpGemmImpl<
+    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_i32_32x32x16_i8<WGAttrCtlEnum::Default_>>>;
+
+using WarpGemmMfma_i32_32x32x16_i8_i8_CTransposed =
+    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImpl_i32_32x32x16_i8<WGAttrCtlEnum::Default_>>>;
+
+using WarpGemmMfma_i32_16x16x32_i8_i8 = WarpGemmImpl<
+    WarpGemmAtrributeMfma<WarpGemmAttributeMfmaImpl_i32_16x16x32_i8<WGAttrCtlEnum::Default_>>>;
+
+using WarpGemmMfma_i32_16x16x32_i8_i8_CTransposed =
+    WarpGemmImpl<WarpGemmAtrributeMfmaTransposedCDistribution<
+        WarpGemmAttributeMfmaImpl_i32_16x16x32_i8<WGAttrCtlEnum::Default_>>>;
+
 } // namespace ck_tile
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp
@@ -1578,8 +1578,8 @@ struct WarpGemmAttributeMfmaImpl_i32_32x32x16_i8
        DISPATCH_MFMA_CTRL_("v_mfma_i32_32x32x16_i8", Ctrl)
        else
        {
-#if defined(__gfx94__)
-            c_vec = __builtin_amdgcn_mfma_i32_32x32x8i8(
+#if defined(__gfx94__) or defined(__gfx95__)
+            c_vec = __builtin_amdgcn_mfma_i32_32x32x16_i8(
                bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
 #elif defined(__gfx908__) || defined(__gfx90a__)
            static_for<0, 8, 1>{}([&](auto k) {
@@ -1609,6 +1609,183 @@ struct WarpGemmAttributeMfmaImpl_i32_32x32x16_i8
    }
 };

+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
+struct WarpGemmAttributeMfmaImpl_i32_16x16x32_i8
+{
+    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
+    using ADataType                     = int8_t;
+    using BDataType                     = int8_t;
+    using CDataType                     = int32_t;
+
+    using AVecType = ext_vector_t<ADataType, 8>;
+    using BVecType = ext_vector_t<BDataType, 8>;
+    using CVecType = ext_vector_t<CDataType, 4>;
+
+    static constexpr index_t kM = 16;
+    static constexpr index_t kN = 16;
+    static constexpr index_t kK = 32;
+
+    static constexpr index_t kAMBlock = 1;
+    static constexpr index_t kBNBlock = 1;
+
+    static constexpr index_t kAMLane     = 16;
+    static constexpr index_t kBNLane     = 16;
+    static constexpr index_t kABKLane    = 4;
+    static constexpr index_t kABKPerLane = 8;
+
+    static constexpr index_t kCMLane     = 4;
+    static constexpr index_t kCNLane     = 16;
+    static constexpr index_t kCM0PerLane = 1;
+    static constexpr index_t kCM1PerLane = 4; // write to 4x AccVGPRs
+
+    // c_vec += a_vec * b_vec
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
+    {
+        DISPATCH_MFMA_CTRL_("v_mfma_i32_16x16x32_i8", Ctrl)
+        else
+        {
+#if defined(__gfx94__) or defined(__gfx95__)
+            c_vec = __builtin_amdgcn_mfma_i32_16x16x32_i8(
+                bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
+#else
+            ck_tile::ignore = c_vec;
+            ck_tile::ignore = a_vec;
+            ck_tile::ignore = b_vec;
+#endif
+        }
+    }
+
+    // c_vec = a_vec * b_vec
+    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
+    {
+        CVecType c_vec{0};
+        operator()(c_vec, a_vec, b_vec);
+        return c_vec;
+    }
+};
+
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
+struct WarpGemmAttributeMfmaImpl_i32_16x16x64_i8
+{
+    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
+    using ADataType                     = int8_t;
+    using BDataType                     = int8_t;
+    using CDataType                     = int32_t;
+
+    using AVecType = ext_vector_t<ADataType, 16>;
+    using BVecType = ext_vector_t<BDataType, 16>;
+    using CVecType = ext_vector_t<CDataType, 4>;
+
+    static constexpr index_t kM = 16;
+    static constexpr index_t kN = 16;
+    static constexpr index_t kK = 64;
+
+    static constexpr index_t kAMBlock = 1;
+    static constexpr index_t kBNBlock = 1;
+
+    static constexpr index_t kAMLane     = 16;
+    static constexpr index_t kBNLane     = 16;
+    static constexpr index_t kABKLane    = 4;
+    static constexpr index_t kABKPerLane = 16;
+
+    static constexpr index_t kCMLane     = 4;
+    static constexpr index_t kCNLane     = 16;
+    static constexpr index_t kCM0PerLane = 1;
+    static constexpr index_t kCM1PerLane = 4; // write to 4x AccVGPRs
+
+    // c_vec += a_vec * b_vec
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
+    {
+        DISPATCH_MFMA_CTRL_("v_mfma_i32_16x16x64_i8", Ctrl)
+        else
+        {
+#if defined(__gfx95__)
+            c_vec = __builtin_amdgcn_mfma_i32_16x16x64_i8(
+                bit_cast<long>(a_vec), bit_cast<long>(b_vec), c_vec, 0, 0, 0);
+#else
+            ck_tile::ignore = c_vec;
+            ck_tile::ignore = a_vec;
+            ck_tile::ignore = b_vec;
+#endif
+        }
+    }
+
+    // c_vec = a_vec * b_vec
+    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
+    {
+        CVecType c_vec{0};
+        operator()(c_vec, a_vec, b_vec);
+        return c_vec;
+    }
+};
+
+template <WGAttrCtlEnum Ctrl_ = WGAttrCtlEnum::Default_>
+struct WarpGemmAttributeMfmaImpl_i32_32x32x32_i8
+{
+    static constexpr WGAttrCtlEnum Ctrl = Ctrl_;
+    using ADataType                     = int8_t;
+    using BDataType                     = int8_t;
+    using CDataType                     = int32_t;
+
+    using AVecType = ext_vector_t<ADataType, 16>;
+    using BVecType = ext_vector_t<BDataType, 16>;
+    using CVecType = ext_vector_t<CDataType, 16>;
+
+    static constexpr index_t kM = 32;
+    static constexpr index_t kN = 32;
+    static constexpr index_t kK = 32;
+
+    static constexpr index_t kAMBlock = 1;
+    static constexpr index_t kBNBlock = 1;
+
+    static constexpr index_t kAMLane     = 32;
+    static constexpr index_t kBNLane     = 32;
+    static constexpr index_t kABKLane    = 2;
+    static constexpr index_t kABKPerLane = 16;
+
+    static constexpr index_t kCMLane     = 2;
+    static constexpr index_t kCNLane     = 32;
+    static constexpr index_t kCM0PerLane = 4;
+    static constexpr index_t kCM1PerLane = 4;
+
+    // c_vec += a_vec * b_vec
+    template <bool post_nop_ = false>
+    CK_TILE_DEVICE void operator()(CVecType& c_vec,
+                                   const AVecType& a_vec,
+                                   const BVecType& b_vec,
+                                   bool_constant<post_nop_> = {}) const
+    {
+        DISPATCH_MFMA_CTRL_("v_mfma_i32_32x32x32_i8", Ctrl)
+        else
+        {
+#if defined(__gfx95__)
+            c_vec =
+                __builtin_amdgcn_mfma_i32_32x32x32_i8(a_vec, bit_cast<long>(b_vec), c_vec, 0, 0, 0);
+#else
+            ck_tile::ignore = c_vec;
+            ck_tile::ignore = a_vec;
+            ck_tile::ignore = b_vec;
+#endif
+        }
+    }
+
+    // c_vec = a_vec * b_vec
+    CK_TILE_DEVICE CVecType operator()(const AVecType& a_vec, const BVecType& b_vec) const
+    {
+        CVecType c_vec{0};
+        operator()(c_vec, a_vec, b_vec);
+        return c_vec;
+    }
+};
+
 #undef DISPATCH_MFMA_

 } // namespace ck_tile
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp
@@ -11,7 +11,7 @@ namespace ck_tile {
 namespace impl {
 template <typename AType,
          typename BType,
-          typename CType,
+          typename AccType,
          index_t MPerWave,
          index_t NPerWave,
          index_t KPerWave,
@@ -22,6 +22,7 @@ struct WarpGemmMfmaDispatcher;

 // clang-format off
 // fp16
+// ADataType, BDataType, AccDataType, MPerWave, NPerWave, KPerWave, TransposeC, SwizzleA, UseStructuredSparsity
 template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32,  8, false> { using Type = WarpGemmMfmaF16F16F32M32N32K8; };
 template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32,  8, true> { using Type = WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution; };
 template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, false> { using Type = WarpGemmMfmaF16F16F32M32N32K16; };
@@ -37,10 +38,12 @@ template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float
 template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, false, true> { using Type = WarpGemmMfmaF16F16F32M32N32K16SwizzleA; };

 // fp16 2:4 structural sparsity
+// ADataType, BDataType, AccDataType, MPerWave, NPerWave, KPerWave, TransposeC, SwizzleA, UseStructuredSparsity
 template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 32, 32, 16, false, false, true> { using Type = WarpGemmSmfmacF16F16F32M32N32K16; };
 template<> struct WarpGemmMfmaDispatcher<ck_tile::half_t, ck_tile::half_t, float, 16, 16, 32, false, false, true> { using Type = WarpGemmSmfmacF16F16F32M16N16K32; };

 // bf16
+// ADataType, BDataType, AccDataType, MPerWave, NPerWave, KPerWave, TransposeC, SwizzleA, UseStructuredSparsity
 template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32,  8, false> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8; };
 template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32,  8, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K8TransposedCDistribution; };
 template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, false> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16; };
@@ -56,6 +59,7 @@ template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float
 template<> struct WarpGemmMfmaDispatcher<ck_tile::bf16_t, ck_tile::bf16_t, float, 32, 32, 16, false, true> { using Type = WarpGemmMfmaBf16Bf16F32M32N32K16SwizzleA; };

 // fp8
+// ADataType, BDataType, AccDataType, MPerWave, NPerWave, KPerWave, TransposeC, SwizzleA, UseStructuredSparsity
 template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32,  16, false> { using Type = WarpGemmMfma_f32_32x32x16_fp8_fp8; };
 template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 32, 32,  32, false> { using Type = WarpGemmMfma_f32_32x32x32_fp8_fp8; };
 template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::fp8_t, float, 16, 16,  32, false> { using Type = WarpGemmMfma_f32_16x16x32_fp8_fp8; };
@@ -81,12 +85,19 @@ template<> struct WarpGemmMfmaDispatcher<ck_tile::fp8_t, ck_tile::bf8_t, float,
 template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::fp8_t, float, 32, 32,  64, false> { using Type = WarpGemmMfma_f32_32x32x64_bf8_fp8; };
 template<> struct WarpGemmMfmaDispatcher<ck_tile::bf8_t, ck_tile::bf8_t, float, 32, 32,  64, false> { using Type = WarpGemmMfma_f32_32x32x64_bf8_bf8; };

+// int8
+// ADataType, BDataType, AccDataType, MPerWave, NPerWave, KPerWave, TransposeC, SwizzleA, UseStructuredSparsity
+template<> struct WarpGemmMfmaDispatcher<ck_tile::int8_t, ck_tile::int8_t, ck_tile::int32_t, 32, 32,  16, false> { using Type = WarpGemmMfma_i32_32x32x16_i8_i8; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::int8_t, ck_tile::int8_t, ck_tile::int32_t, 32, 32,  16, true> { using Type = WarpGemmMfma_i32_32x32x16_i8_i8_CTransposed; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::int8_t, ck_tile::int8_t, ck_tile::int32_t, 16, 16,  32, false> { using Type = WarpGemmMfma_i32_16x16x32_i8_i8; };
+template<> struct WarpGemmMfmaDispatcher<ck_tile::int8_t, ck_tile::int8_t, ck_tile::int32_t, 16, 16,  32, true> { using Type = WarpGemmMfma_i32_16x16x32_i8_i8_CTransposed; };
+
 // clang-format on
 } // namespace impl

 template <typename AType,
          typename BType,
-          typename CType,
+          typename AccType,
          index_t MPerWave,
          index_t NPerWave,
          index_t KPerWave,
@@ -95,7 +106,7 @@ template <typename AType,
          bool UseStructuredSparsity = false>
 using WarpGemmMfmaDispatcher = typename impl::WarpGemmMfmaDispatcher<AType,
                                                                     BType,
-                                                                     CType,
+                                                                     AccType,
                                                                     MPerWave,
                                                                     NPerWave,
                                                                     KPerWave,
--- a/include/ck_tile/ops/norm_reduce/block/block_norm_reduce.hpp
+++ b/include/ck_tile/ops/norm_reduce/block/block_norm_reduce.hpp
@@ -250,7 +250,7 @@ struct BlockNormReduceCrossWarpSync
        //    | w0 | w1 | w2 | w3 |   ----->  | w0123 |
        //
        //   -> also store data from every wave into LDS
-        constexpr index_t num_warps = BlockShape::BlockSize / WarpSize;
+        constexpr index_t num_warps = BlockShape::BlockSize / get_warp_size();
        return num_warps * 4 * thread_buf_size * sizeof(float);
    }

@@ -276,7 +276,7 @@ struct BlockNormReduceCrossWarpSync
        const index_t lane_id           = get_lane_id();
        const index_t warp_id           = get_warp_id();
        constexpr auto num_reduce_warps = GetReduceWarps<MeanDistributedTensor_>();
-        constexpr index_t num_warps     = BlockShape::BlockSize / WarpSize;
+        constexpr index_t num_warps     = BlockShape::BlockSize / get_warp_size();
        const index_t smem_offset       = warp_id;

        // skip if nonthing to do
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm2.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm2.hpp
@@ -6,6 +6,7 @@
 #include <iostream>
 #include <sstream>
 #include <unordered_map>
+#include <mutex>

 #include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
 #include "ck/tensor_operation/gpu/device/device_base.hpp"
@@ -85,6 +86,7 @@ struct ReferenceMoeGemm2 : public device::BaseOperator

        float Run(const Argument& arg)
        {
+            std::vector<std::mutex> n_locks(arg.c_t_n_.mDesc.GetLengths()[1]);
            arg.c_t_n_.SetZero();
            auto f_mk_kn_mn = [&](auto m, auto n) {
                const int K = arg.a_t_k_k_.mDesc.GetLengths()[2];
@@ -142,8 +144,8 @@ struct ReferenceMoeGemm2 : public device::BaseOperator
                            ck::type_convert<AccDataType>(v_a) * ck::type_convert<AccDataType>(v_b);
                    }
                    CDataType v_c{0};
-                    D0DataType v_d0 = arg.d0_(m, n); // a
-                    D0DataType v_d1 = arg.d1_(e, n); // b
+                    D0DataType v_d0 = arg.d0_(t, topk_id); // a
+                    D0DataType v_d1 = arg.d1_(e, n);       // b
                    if constexpr(MulRoutedWeight)
                    {
                        arg.c_element_op_(v_c, v_acc, v_d0, v_d1, v_topk_w);
@@ -152,6 +154,7 @@ struct ReferenceMoeGemm2 : public device::BaseOperator
                    {
                        arg.c_element_op_(v_c, v_acc, v_d0, v_d1, 1.f);
                    }
+                    std::lock_guard<std::mutex> lock(n_locks[n]);
                    arg.c_t_n_(t, n) += v_c;
                }
            };
--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -16,6 +16,46 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 namespace instance {
+
+#ifdef CK_USE_WMMA
+#ifdef CK_ENABLE_FP16
+void add_device_batched_gemm_wmma_universal_f16_f16_f16_gkm_gkn_gmn_instances(
+    std::vector<std::unique_ptr<
+        DeviceBatchedGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+void add_device_batched_gemm_wmma_universal_f16_f16_f16_gkm_gnk_gmn_instances(
+    std::vector<std::unique_ptr<
+        DeviceBatchedGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+void add_device_batched_gemm_wmma_universal_f16_f16_f16_gmk_gkn_gmn_instances(
+    std::vector<std::unique_ptr<
+        DeviceBatchedGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+void add_device_batched_gemm_wmma_universal_f16_f16_f16_gmk_gnk_gmn_instances(
+    std::vector<std::unique_ptr<
+        DeviceBatchedGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+#endif // CK_ENABLE_FP16
+#ifdef CK_ENABLE_BF16
+void add_device_batched_gemm_wmma_universal_bf16_bf16_bf16_gkm_gkn_gmn_instances(
+    std::vector<std::unique_ptr<
+        DeviceBatchedGemm<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+void add_device_batched_gemm_wmma_universal_bf16_bf16_bf16_gkm_gnk_gmn_instances(
+    std::vector<std::unique_ptr<
+        DeviceBatchedGemm<Col, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+void add_device_batched_gemm_wmma_universal_bf16_bf16_bf16_gmk_gkn_gmn_instances(
+    std::vector<std::unique_ptr<
+        DeviceBatchedGemm<Row, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+void add_device_batched_gemm_wmma_universal_bf16_bf16_bf16_gmk_gnk_gmn_instances(
+    std::vector<std::unique_ptr<
+        DeviceBatchedGemm<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+#endif // CK_ENABLE_BF16
+#endif // CK_USE_WMMA
+#ifdef CK_USE_XDL
 #ifdef CK_ENABLE_BF16
 void add_device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instances(
    std::vector<std::unique_ptr<
@@ -124,6 +164,8 @@ void add_device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instances(
                                                  PassThrough,
                                                  PassThrough>>>& instances);
 #endif
+#endif // CK_USE_XDL
+
 template <typename ALayout,
          typename BLayout,
          typename CLayout,
@@ -154,6 +196,66 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatche
    static auto GetInstances()
    {
        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+#ifdef CK_USE_WMMA
+#ifdef CK_ENABLE_FP16
+        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
+                     is_same_v<CDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_batched_gemm_wmma_universal_f16_f16_f16_gkm_gkn_gmn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_batched_gemm_wmma_universal_f16_f16_f16_gkm_gnk_gmn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_batched_gemm_wmma_universal_f16_f16_f16_gmk_gkn_gmn_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_batched_gemm_wmma_universal_f16_f16_f16_gmk_gnk_gmn_instances(op_ptrs);
+            }
+        }
+#endif // CK_ENABLE_FP16
+#ifdef CK_ENABLE_BF16
+        if constexpr(is_same_v<ADataType, bhalf_t> && is_same_v<BDataType, bhalf_t> &&
+                     is_same_v<CDataType, bhalf_t>)
+        {
+            if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_batched_gemm_wmma_universal_bf16_bf16_bf16_gkm_gkn_gmn_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_batched_gemm_wmma_universal_bf16_bf16_bf16_gkm_gnk_gmn_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_batched_gemm_wmma_universal_bf16_bf16_bf16_gmk_gkn_gmn_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_batched_gemm_wmma_universal_bf16_bf16_bf16_gmk_gnk_gmn_instances(
+                    op_ptrs);
+            }
+        }
+#endif // CK_ENABLE_BF16
+#endif // CK_USE_WMMA
+#ifdef CK_USE_XDL
 #ifdef CK_ENABLE_FP32
        if constexpr(is_same_v<ADataType, float> && is_same_v<BDataType, float> &&
                     is_same_v<CDataType, float>)
@@ -258,6 +360,7 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceBatche
            }
        }
 #endif
+#endif // CK_USE_XDL
        return op_ptrs;
    }
 };
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp
@@ -179,6 +179,38 @@ using device_grouped_conv_fwd_xdl_f16_instances = std::tuple<
    // clang-format on
    >;

+template <index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          ConvolutionForwardSpecialization ConvSpec,
+          typename DsDataTypes  = Tuple<>,
+          typename OutElementOp = PassThrough>
+using device_grouped_conv_fwd_xdl_f16_nchw_instances = std::tuple<
+    // clang-format off
+        //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
+        //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+    // generic instance
+    DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsDataTypes,   F16, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,  256,   64,    128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,          1,              1,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              1>,
+    DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsDataTypes,   F16, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,  64,    64,     32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,          1,              4,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 8, 1, 8>,               1>,
+    DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsDataTypes,   F16, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,  256,   128,    64,    32,   8,   8,   16,   16,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,          1,              2,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,          1,          1,           1,               S<1, 32, 1, 8>,              1>,
+    DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsDataTypes,   F16, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,  64,    64,     32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,          1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 8, 1, 8>,               1>,
+    // 32x32 instance 
+    DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsDataTypes,   F16, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,  256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,          1,              4,              8,          1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,            2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4>,
+    DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsDataTypes,   F16, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,  256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,          1,              2,              8,          1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,            2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4>,
+    DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsDataTypes,   F16, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,  256,   128,   128,    32,   8,   8,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,          1,              2,              8,          1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,            2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4>,
+    DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsDataTypes,   F16, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,  256,   128,    64,    32,   8,   8,   32,   32,    2,    1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,          1,              2,              8,          1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,            2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4>,
+    DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsDataTypes,   F16, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,  256,   64,    128,    32,   8,   8,   32,   32,    1,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,          1,              1,              8,          1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,            2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               4>,
+    DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsDataTypes,   F16, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,  64,    64,     32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,          1,              4,              8,          1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,            2,              8,              8,         1,           1,           1,               S<1, 8, 1, 8>,                4>,
+    DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsDataTypes,   F16, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,  128,   128,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,          1,              4,              8,          1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,            2,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,               4>,
+    // 16x16 instance
+    DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsDataTypes,   F16, PassThrough, PassThrough, OutElementOp,       ConvSpec, GemmMNKPadding,        1,  256,   128,    64,    32,   8,   8,   16,   16,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,          1,              2,              8,          1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,            2,              8,              8,          1,          1,           1,               S<1, 32, 1, 8>,               4>
+    // clang-format on
+    >;
+
 template <index_t NDimSpatial,
          typename ALayout,
          typename BLayout,
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -688,7 +688,6 @@ void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instances(
                                                                PassThrough,
                                                                PassThrough,
                                                                PassThrough>>>& instances);
-
 void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_16x16_instances(
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                NGCDHW,
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -295,11 +295,8 @@ FOREACH(subdir_path ${dir_list})

        if(MIOPEN_REQ_LIBS_ONLY)
            message(STATUS "Removing all sources that are not required for MIOpen")
-            if("${cmake_instance}" MATCHES "gemm" OR 
-               "${cmake_instance}" MATCHES "mha" OR 
-               "${cmake_instance}" MATCHES "contraction" OR 
-               "${cmake_instance}" MATCHES "reduce")
-                    set(add_inst 0)
+            if(NOT "${cmake_instance}" MATCHES "conv")
+                set(add_inst 0)
            endif()
        endif()

@@ -328,7 +325,7 @@ ENDFOREACH()



-if(CK_DEVICE_OTHER_INSTANCES)
+if(CK_DEVICE_OTHER_INSTANCES AND NOT MIOPEN_REQ_LIBS_ONLY)
        add_library(device_other_operations ${CK_DEVICE_OTHER_INSTANCES})
        add_library(composablekernels::device_other_operations ALIAS device_other_operations)
        set_target_properties(device_other_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/CMakeLists.txt
@@ -1,19 +1,27 @@
-# ONLY XDL_KERNELS
+# ONLY XDL_AND_WMMA_KERNELS
 set(BATCHED_GEMM_INSTANCES)
-list(APPEND BATCHED_GEMM_INSTANCES device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp
-                                      device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp
-                                      device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp
-                                      device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp
-                                      device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp
-                                      device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp
-                                      device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp
-                                      device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp
-                                      device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp
-                                      device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp
-                                      device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp
-                                      device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp
-                                      device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp
-                                      device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instance.cpp
-                                      device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp
-                                      device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp)
+list(APPEND BATCHED_GEMM_INSTANCES device_batched_gemm_wmma_universal_f16_f16_f16_gkm_gkn_gmn_instance.cpp
+                                   device_batched_gemm_wmma_universal_f16_f16_f16_gkm_gnk_gmn_instance.cpp
+                                   device_batched_gemm_wmma_universal_f16_f16_f16_gmk_gkn_gmn_instance.cpp
+                                   device_batched_gemm_wmma_universal_f16_f16_f16_gmk_gnk_gmn_instance.cpp
+                                   device_batched_gemm_wmma_universal_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp
+                                   device_batched_gemm_wmma_universal_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp
+                                   device_batched_gemm_wmma_universal_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp
+                                   device_batched_gemm_wmma_universal_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp
+                                   device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp
+                                   device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp
+                                   device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp
+                                   device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp
+                                   device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp
+                                   device_batched_gemm_xdl_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp
+                                   device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp
+                                   device_batched_gemm_xdl_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp
+                                   device_batched_gemm_xdl_f32_f32_f32_gmk_gkn_gmn_instance.cpp
+                                   device_batched_gemm_xdl_f32_f32_f32_gmk_gnk_gmn_instance.cpp
+                                   device_batched_gemm_xdl_f32_f32_f32_gkm_gkn_gmn_instance.cpp
+                                   device_batched_gemm_xdl_f32_f32_f32_gkm_gnk_gmn_instance.cpp
+                                   device_batched_gemm_xdl_int8_int8_int8_gmk_gkn_gmn_instance.cpp
+                                   device_batched_gemm_xdl_int8_int8_int8_gmk_gnk_gmn_instance.cpp
+                                   device_batched_gemm_xdl_int8_int8_int8_gkm_gkn_gmn_instance.cpp
+                                   device_batched_gemm_xdl_int8_int8_int8_gkm_gnk_gmn_instance.cpp)
 add_instance_library(device_batched_gemm_instance ${BATCHED_GEMM_INSTANCES})
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_wmma_universal_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_wmma_universal_bf16_bf16_bf16_gkm_gkn_gmn_instance.cpp
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_wmma_cshuffle_v3.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using BF16 = bhalf_t;
+using F32  = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec>
+using device_batched_gemm_wmma_universal_bf16_bf16_bf16_gkm_gkn_gmn_comp_instances =
+    std::tuple<
+        // clang-format off
+        //################################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle|           A|           B|           C| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer|    BlkGemm|                      BlkGemm|
+        //################################|        |        |        | Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|       ScalarPerVector|  PipeSched|                  PipelineVer|
+        //################################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|            _NPerBlock|           |                             |
+        //################################|        |        |        |     |      |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                      |           |                             |
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
+        // clang-format on
+        >;
+
+void add_device_batched_gemm_wmma_universal_bf16_bf16_bf16_gkm_gkn_gmn_instances(
+    std::vector<std::unique_ptr<
+        DeviceBatchedGemm<Col, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_wmma_universal_bf16_bf16_bf16_gkm_gkn_gmn_comp_instances<
+            GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_wmma_universal_bf16_bf16_bf16_gkm_gkn_gmn_comp_instances<
+            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_wmma_universal_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_wmma_universal_bf16_bf16_bf16_gkm_gnk_gmn_instance.cpp
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_wmma_cshuffle_v3.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using BF16 = bhalf_t;
+using F32  = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec>
+using device_batched_gemm_wmma_universal_bf16_bf16_bf16_gkm_gnk_gmn_comp_instances =
+    std::tuple<
+        // clang-format off
+        //################################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle|           A|           B|           C| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer|    BlkGemm|                      BlkGemm|
+        //################################|        |        |        | Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|       ScalarPerVector|  PipeSched|                  PipelineVer|
+        //################################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|            _NPerBlock|           |                             |
+        //################################|        |        |        |     |      |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                      |           |                             |
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 64, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 2>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    64,    32,    64,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
+        // clang-format on
+        >;
+
+void add_device_batched_gemm_wmma_universal_bf16_bf16_bf16_gkm_gnk_gmn_instances(
+    std::vector<std::unique_ptr<
+        DeviceBatchedGemm<Col, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_wmma_universal_bf16_bf16_bf16_gkm_gnk_gmn_comp_instances<
+            GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_wmma_universal_bf16_bf16_bf16_gkm_gnk_gmn_comp_instances<
+            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_wmma_universal_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_wmma_universal_bf16_bf16_bf16_gmk_gkn_gmn_instance.cpp
@@ -0,0 +1,76 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_wmma_cshuffle_v3.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using BF16 = bhalf_t;
+using F32  = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec>
+using device_batched_gemm_wmma_universal_bf16_bf16_bf16_gmk_gkn_gmn_comp_instances =
+    std::tuple<
+        // clang-format off
+        //################################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle|           A|           B|           C| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer|    BlkGemm|                      BlkGemm|
+        //################################|        |        |        | Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|       ScalarPerVector|  PipeSched|                  PipelineVer|
+        //################################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|            _NPerBlock|           |                             |
+        //################################|        |        |        |     |      |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                      |           |                             |
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         0,          1,          1,        S<1, 32, 1, 2>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Row,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
+        // clang-format on
+        >;
+
+void add_device_batched_gemm_wmma_universal_bf16_bf16_bf16_gmk_gkn_gmn_instances(
+    std::vector<std::unique_ptr<
+        DeviceBatchedGemm<Row, Row, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_wmma_universal_bf16_bf16_bf16_gmk_gkn_gmn_comp_instances<
+            GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_wmma_universal_bf16_bf16_bf16_gmk_gkn_gmn_comp_instances<
+            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_wmma_universal_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_wmma_universal_bf16_bf16_bf16_gmk_gnk_gmn_instance.cpp
@@ -0,0 +1,79 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_wmma_cshuffle_v3.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using BF16 = bhalf_t;
+using F32  = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec>
+using device_batched_gemm_wmma_universal_bf16_bf16_bf16_gmk_gnk_gmn_comp_instances =
+    std::tuple<
+        // clang-format off
+        //################################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle|           A|           B|           C| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer|    BlkGemm|                      BlkGemm|
+        //################################|        |        |        | Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|       ScalarPerVector|  PipeSched|                  PipelineVer|
+        //################################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|            _NPerBlock|           |                             |
+        //################################|        |        |        |     |      |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                      |           |                             |
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 64, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 64, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row, BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
+        // clang-format on
+        >;
+
+void add_device_batched_gemm_wmma_universal_bf16_bf16_bf16_gmk_gnk_gmn_instances(
+    std::vector<std::unique_ptr<
+        DeviceBatchedGemm<Row, Col, Row, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_wmma_universal_bf16_bf16_bf16_gmk_gnk_gmn_comp_instances<
+            GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_wmma_universal_bf16_bf16_bf16_gmk_gnk_gmn_comp_instances<
+            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_wmma_universal_f16_f16_f16_gkm_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_wmma_universal_f16_f16_f16_gkm_gkn_gmn_instance.cpp
@@ -0,0 +1,70 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_wmma_cshuffle_v3.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = half_t;
+using F32 = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec>
+using device_batched_gemm_wmma_universal_f16_f16_f16_gkm_gkn_gmn_comp_instances =
+    std::tuple<
+        // clang-format off
+        //################################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle|           A|           B|           C| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer|    BlkGemm|                      BlkGemm|
+        //################################|        |        |        | Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|       ScalarPerVector|  PipeSched|                  PipelineVer|
+        //################################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|            _NPerBlock|           |                             |
+        //################################|        |        |        |     |      |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                      |           |                             |
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
+        // clang-format on
+        >;
+
+void add_device_batched_gemm_wmma_universal_f16_f16_f16_gkm_gkn_gmn_instances(
+    std::vector<std::unique_ptr<
+        DeviceBatchedGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_wmma_universal_f16_f16_f16_gkm_gkn_gmn_comp_instances<GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_wmma_universal_f16_f16_f16_gkm_gkn_gmn_comp_instances<
+            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_wmma_universal_f16_f16_f16_gkm_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_wmma_universal_f16_f16_f16_gkm_gnk_gmn_instance.cpp
@@ -0,0 +1,72 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_wmma_cshuffle_v3.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = half_t;
+using F32 = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec>
+using device_batched_gemm_wmma_universal_f16_f16_f16_gkm_gnk_gmn_comp_instances =
+    std::tuple<
+        // clang-format off
+        //################################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle|           A|           B|           C| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer|    BlkGemm|                      BlkGemm|
+        //################################|        |        |        | Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|       ScalarPerVector|  PipeSched|                  PipelineVer|
+        //################################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|            _NPerBlock|           |                             |
+        //################################|        |        |        |     |      |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                      |           |                             |
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 64, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 2>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    64,    32,    64,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
+        // clang-format on
+        >;
+
+void add_device_batched_gemm_wmma_universal_f16_f16_f16_gkm_gnk_gmn_instances(
+    std::vector<std::unique_ptr<
+        DeviceBatchedGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_wmma_universal_f16_f16_f16_gkm_gnk_gmn_comp_instances<GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_wmma_universal_f16_f16_f16_gkm_gnk_gmn_comp_instances<
+            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_wmma_universal_f16_f16_f16_gmk_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_wmma_universal_f16_f16_f16_gmk_gkn_gmn_instance.cpp
@@ -0,0 +1,75 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_wmma_cshuffle_v3.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = half_t;
+using F32 = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec>
+using device_batched_gemm_wmma_universal_f16_f16_f16_gmk_gkn_gmn_comp_instances =
+    std::tuple<
+        // clang-format off
+        //################################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle|           A|           B|           C| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer|    BlkGemm|                      BlkGemm|
+        //################################|        |        |        | Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|       ScalarPerVector|  PipeSched|                  PipelineVer|
+        //################################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|            _NPerBlock|           |                             |
+        //################################|        |        |        |     |      |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                      |           |                             |
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         0,          1,          1,        S<1, 32, 1, 2>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
+        // clang-format on
+        >;
+
+void add_device_batched_gemm_wmma_universal_f16_f16_f16_gmk_gkn_gmn_instances(
+    std::vector<std::unique_ptr<
+        DeviceBatchedGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_wmma_universal_f16_f16_f16_gmk_gkn_gmn_comp_instances<GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_wmma_universal_f16_f16_f16_gmk_gkn_gmn_comp_instances<
+            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_wmma_universal_f16_f16_f16_gmk_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_wmma_universal_f16_f16_f16_gmk_gnk_gmn_instance.cpp
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_wmma_cshuffle_v3.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/host_utility/device_prop.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using F16 = half_t;
+using F32 = float;
+
+using Row = tensor_layout::gemm::RowMajor;
+using Col = tensor_layout::gemm::ColumnMajor;
+
+template <index_t... Is>
+using S = Sequence<Is...>;
+
+using PassThrough = element_wise::PassThrough;
+
+static constexpr auto GemmDefault    = GemmSpecialization::Default;
+static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
+
+static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
+static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
+
+template <GemmSpecialization GemmSpec>
+using device_batched_gemm_wmma_universal_f16_f16_f16_gmk_gnk_gmn_comp_instances =
+    std::tuple<
+        // clang-format off
+        //################################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| CShuffle|           A|           B|           C| GemmSpec| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MRepeat| NRepeat|    ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|    BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CShuffleBlockTransfer| CShuffleBlockTransfer|    BlkGemm|                      BlkGemm|
+        //################################|        |        |        | Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise|         |  Size| Block| Block| Block|    |    | Wmma| Wmma|        |        |     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraM|     ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|    ExtraN|    MRepeat|    NRepeat|        ClusterLengths|       ScalarPerVector|  PipeSched|                  PipelineVer|
+        //################################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|         |      |      |      |      |    |    |     |     |        |        | Lengths_AK0_M_AK1|   ArrangeOrder|               |               |      PerVector|  PerVector_AK1|          | Lengths_BK0_N_BK1|   ArrangeOrder|               |               |      PerVector|  PerVector_BK1|          | PerShuffle| PerShuffle|     _MBlock_MPerBlock|            _NPerBlock|           |                             |
+        //################################|        |        |        |     |      |      |        |         |            |            |            |         |      |      |      |      |    |    |     |     |        |        |                  |               |               |               |               |               |          |                  |               |               |               |               |               |          |           |           |     _NBlock_NPerBlock|                      |           |                             |
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   256,    64,   8,   8,   16,   16,       4,       4,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 64, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 64, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceBatchedGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
+        // clang-format on
+        >;
+
+void add_device_batched_gemm_wmma_universal_f16_f16_f16_gmk_gnk_gmn_instances(
+    std::vector<std::unique_ptr<
+        DeviceBatchedGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_wmma_universal_f16_f16_f16_gmk_gnk_gmn_comp_instances<GemmDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_wmma_universal_f16_f16_f16_gmk_gnk_gmn_comp_instances<
+            GemmMNKPadding>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt
@@ -1,5 +1,5 @@
 # XDL_DL_WMMA_KERNELS
-add_instance_library(device_grouped_conv2d_fwd_instance
+set(GROUPED_CONV2D_FWD
   #xdl
   # GNHWC, GKYXC, GNHWK
   xdl/device_grouped_conv2d_fwd_xdl_gnhwc_gkyxc_gnhwk_bf16_instance.cpp
@@ -19,8 +19,6 @@ add_instance_library(device_grouped_conv2d_fwd_instance
   xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp
   xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instance.cpp
   # NGCHW, GKCYX, NGKHW
-   xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp
-   xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp
   xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_instance.cpp
   xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_16x16_instance.cpp
   xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_16x16_instance.cpp
@@ -46,12 +44,10 @@ add_instance_library(device_grouped_conv2d_fwd_instance
   xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_intra_instance.cpp
   xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_intra_instance.cpp
   xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_intra_instance.cpp
-   xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.cpp
   # NHWGC, GKYXC, NHWGK
   xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_mem_inter_instance.cpp
   xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_mem_inter_instance.cpp
   xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.cpp
-   xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp
   # NGCHW, GKCYX, NGKHW
   xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_mem_intra_instance.cpp
   xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_mem_intra_instance.cpp
@@ -71,7 +67,6 @@ add_instance_library(device_grouped_conv2d_fwd_instance
   xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_bf16_comp_part2_instance.cpp
   xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f16_comp_part2_instance.cpp
   # NGCHW, GKCYX, NGKHW
-   xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instance.cpp
   xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_comp_instance.cpp
   xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f32_comp_instance.cpp
   xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_2x_instance.cpp
@@ -105,3 +100,47 @@ add_instance_library(device_grouped_conv2d_fwd_instance
  wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_f16_oddc_instance.cpp
  wmma/device_grouped_conv2d_fwd_wmma_nhwgc_gkyxc_nhwgk_i8_oddc_instance.cpp
 )
+# Add generated files for sharded instantiations.
+include(ShardInstantiation)
+
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instances
+  TEMPLATE_FILE xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instance.in
+  NUM_SHARDS 16
+  SRC_LIST GROUPED_CONV2D_FWD
+  OUTPUT_DIR ${GENERATED_DIR}/xdl
+)
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instances
+  TEMPLATE_FILE xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instance.in
+  NUM_SHARDS 16
+  SRC_LIST GROUPED_CONV2D_FWD
+  OUTPUT_DIR ${GENERATED_DIR}/xdl
+)
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instances
+  TEMPLATE_FILE xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instance.in
+  NUM_SHARDS 16
+  SRC_LIST GROUPED_CONV2D_FWD
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
+)
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instances
+  TEMPLATE_FILE xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.in
+  NUM_SHARDS 21
+  SRC_LIST GROUPED_CONV2D_FWD
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
+)
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instances
+  TEMPLATE_FILE xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.in
+  NUM_SHARDS 21
+  SRC_LIST GROUPED_CONV2D_FWD
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
+)
+add_instance_library(device_grouped_conv2d_fwd_instance ${GROUPED_CONV2D_FWD})
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instance.cpp
@@ -1,16 +1,14 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
 #include "ck/host_utility/device_prop.hpp"
+#include "ck/utility/filter_tuple.hpp"

-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instances(
+namespace ck::tensor_operation::device::instance {
+
+using device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instances =
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                NGCHW,
                                                                GKCYX,
@@ -22,19 +20,23 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instances(
                                                                BF16,
                                                                PassThrough,
                                                                PassThrough,
-                                                                PassThrough>>>& instances)
+                                                                PassThrough>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instances_shard([[maybe_unused]]
+    device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_comp_instances& instances)
 {
    add_device_operation_instances(
        instances,
-        device_grouped_conv_fwd_xdl_bf16_comp_instances<2,
-                                                        NGCHW,
-                                                        GKCYX,
-                                                        Empty_Tuple,
-                                                        NGKHW,
-                                                        ConvFwdDefault>{});
+        ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_bf16_comp_instances<2,
+                                                                                           NGCHW,
+                                                                                           GKCYX,
+                                                                                           Empty_Tuple,
+                                                                                           NGKHW,
+                                                                                           ConvFwdDefault>,
+                                           Shards,
+                                           ShardIndex>{});
 }

-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
+} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instance.cpp
@@ -3,13 +3,11 @@

 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"

-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instances(
+namespace ck::tensor_operation::device::instance {
+
+using device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instances =
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
                                                                NGCHW,
                                                                GKCYX,
@@ -21,32 +19,40 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instances(
                                                                BF16,
                                                                PassThrough,
                                                                PassThrough,
-                                                                PassThrough>>>& instances)
+                                                                PassThrough>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instances_shard(
+    device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_bf16_instances& instances)
 {
    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_bf16_instances<2,
+                                   ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_bf16_instances<2,
                                                                              NGCHW,
                                                                              GKCYX,
                                                                              Empty_Tuple,
                                                                              NGKHW,
-                                                                              ConvFwdDefault>{});
+                                                                              ConvFwdDefault>,
+                                   Shards,
+                                   ShardIndex>{});
    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_bf16_instances<2,
+                                   ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_bf16_instances<2,
                                                                              NGCHW,
                                                                              GKCYX,
                                                                              Empty_Tuple,
                                                                              NGKHW,
-                                                                              ConvFwd1x1P0>{});
+                                                                              ConvFwd1x1P0>,
+                                   Shards,
+                                   ShardIndex>{});
    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_bf16_instances<2,
+                                   ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_bf16_instances<2,
                                                                              NGCHW,
                                                                              GKCYX,
                                                                              Empty_Tuple,
                                                                              NGKHW,
-                                                                              ConvFwd1x1S1P0>{});
+                                                                              ConvFwd1x1S1P0>,
+                                   Shards,
+                                   ShardIndex>{});
 }

-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
+} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instance.cpp
@@ -1,52 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
-                                                                NGCHW,
-                                                                GKCYX,
-                                                                Empty_Tuple,
-                                                                NGKHW,
-                                                                F16,
-                                                                F16,
-                                                                Empty_Tuple,
-                                                                F16,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances)
-{
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_f16_instances<2,
-                                                                             NGCHW,
-                                                                             GKCYX,
-                                                                             Empty_Tuple,
-                                                                             NGKHW,
-                                                                             ConvFwdDefault>{});
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_f16_instances<2,
-                                                                             NGCHW,
-                                                                             GKCYX,
-                                                                             Empty_Tuple,
-                                                                             NGKHW,
-                                                                             ConvFwd1x1P0>{});
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_f16_instances<2,
-                                                                             NGCHW,
-                                                                             GKCYX,
-                                                                             Empty_Tuple,
-                                                                             NGKHW,
-                                                                             ConvFwd1x1S1P0>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.cpp
@@ -1,66 +1,67 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"

-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instances(
+namespace ck::tensor_operation::device::instance {
+
+using device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instances =
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
-                                                                NHWGC,
-                                                                GKYXC,
+                                                                NGCHW,
+                                                                GKCYX,
                                                                Empty_Tuple,
-                                                                NHWGK,
-                                                                int8_t,
-                                                                int8_t,
+                                                                NGKHW,
+                                                                F16,
+                                                                F16,
                                                                Empty_Tuple,
-                                                                int8_t,
+                                                                F16,
                                                                PassThrough,
                                                                PassThrough,
-                                                                PassThrough>>>& instances)
+                                                                PassThrough>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instances_shard(
+    device_grouped_conv2d_fwd_xdl_ngchw_gkcyx_ngkhw_f16_instances& instances)
 {
    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
-                                                                                  NHWGC,
-                                                                                  GKYXC,
-                                                                                  Empty_Tuple,
-                                                                                  NHWGK,
-                                                                                  ConvFwdDefault,
-                                                                                  Intrawave>{});
-
+                                   ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f16_instances<2,
+                                                                              NGCHW,
+                                                                              GKCYX,
+                                                                              Empty_Tuple,
+                                                                              NGKHW,
+                                                                              ConvFwdDefault>,
+                                   Shards,
+                                   ShardIndex>{});
    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
-                                                                                  NHWGC,
-                                                                                  GKYXC,
-                                                                                  Empty_Tuple,
-                                                                                  NHWGK,
-                                                                                  ConvFwd1x1P0,
-                                                                                  Intrawave>{});
-
+                                   ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f16_instances<2,
+                                                                              NGCHW,
+                                                                              GKCYX,
+                                                                              Empty_Tuple,
+                                                                              NGKHW,
+                                                                              ConvFwd1x1P0>,
+                                   Shards,
+                                   ShardIndex>{});
    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
-                                                                                  NHWGC,
-                                                                                  GKYXC,
-                                                                                  Empty_Tuple,
-                                                                                  NHWGK,
-                                                                                  ConvFwd1x1S1P0,
-                                                                                  Intrawave>{});
-
+                                   ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f16_instances<2,
+                                                                              NGCHW,
+                                                                              GKCYX,
+                                                                              Empty_Tuple,
+                                                                              NGKHW,
+                                                                              ConvFwd1x1S1P0>,
+                                   Shards,
+                                   ShardIndex>{});
    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
-                                                                                  NHWGC,
-                                                                                  GKYXC,
-                                                                                  Empty_Tuple,
-                                                                                  NHWGK,
-                                                                                  ConvFwdOddC,
-                                                                                  Intrawave>{});
+                                   ck::util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f16_nchw_instances<2,
+                                                                              NGCHW,
+                                                                              GKCYX,
+                                                                              Empty_Tuple,
+                                                                              NGKHW,
+                                                                              ConvFwd1x1S1P0>,
+                                   Shards,
+                                   ShardIndex>{});
 }

-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
+} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp
@@ -31,6 +31,14 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_instances(
                                                          Empty_Tuple,
                                                          NGKHW,
                                                          ConvFwdDefault>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f16_nchw_instances<2,
+                                                       NGCHW,
+                                                       GKYXC,
+                                                       Empty_Tuple,
+                                                       NGKHW,
+                                                       ConvFwd1x1S1P0>{});
 }

 } // namespace instance
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp
@@ -1,66 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
-                                                                NHWGC,
-                                                                GKYXC,
-                                                                Empty_Tuple,
-                                                                NHWGK,
-                                                                int8_t,
-                                                                int8_t,
-                                                                Empty_Tuple,
-                                                                int8_t,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances)
-{
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
-                                                                                  NHWGC,
-                                                                                  GKYXC,
-                                                                                  Empty_Tuple,
-                                                                                  NHWGK,
-                                                                                  ConvFwdDefault,
-                                                                                  Interwave>{});
-
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
-                                                                                  NHWGC,
-                                                                                  GKYXC,
-                                                                                  Empty_Tuple,
-                                                                                  NHWGK,
-                                                                                  ConvFwd1x1P0,
-                                                                                  Interwave>{});
-
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
-                                                                                  NHWGC,
-                                                                                  GKYXC,
-                                                                                  Empty_Tuple,
-                                                                                  NHWGK,
-                                                                                  ConvFwd1x1S1P0,
-                                                                                  Interwave>{});
-
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_int8_mem_instances<2,
-                                                                                  NHWGC,
-                                                                                  GKYXC,
-                                                                                  Empty_Tuple,
-                                                                                  NHWGK,
-                                                                                  ConvFwdOddC,
-                                                                                  Interwave>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.in
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.in
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck::tensor_operation::device::instance {
+
+using device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instances =
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instances_shard(
+    device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instances& instances)
+{
+    add_device_operation_instances(
+        instances,
+        ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           Empty_Tuple,
+                                                           NHWGK,
+                                                           ConvFwdDefault,
+                                                           Interwave>,
+            Shards,
+            ShardIndex>{});
+
+    add_device_operation_instances(instances,
+                                   ck::util::filter_tuple_by_modulo_t<
+                                       device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                      NHWGC,
+                                                                                      GKYXC,
+                                                                                      Empty_Tuple,
+                                                                                      NHWGK,
+                                                                                      ConvFwd1x1P0,
+                                                                                      Interwave>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           Empty_Tuple,
+                                                           NHWGK,
+                                                           ConvFwd1x1S1P0,
+                                                           Interwave>,
+            Shards,
+            ShardIndex>{});
+
+    add_device_operation_instances(instances,
+                                   ck::util::filter_tuple_by_modulo_t<
+                                       device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                      NHWGC,
+                                                                                      GKYXC,
+                                                                                      Empty_Tuple,
+                                                                                      NHWGK,
+                                                                                      ConvFwdOddC,
+                                                                                      Interwave>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.in
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instance.in
@@ -0,0 +1,80 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck::tensor_operation::device::instance {
+
+using device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instances =
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NHWGC,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NHWGK,
+                                                                int8_t,
+                                                                int8_t,
+                                                                Empty_Tuple,
+                                                                int8_t,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>;
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instances_shard(
+    device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_intra_instances& instances)
+{
+    add_device_operation_instances(
+        instances,
+        ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           Empty_Tuple,
+                                                           NHWGK,
+                                                           ConvFwdDefault,
+                                                           Intrawave>,
+            Shards,
+            ShardIndex>{});
+
+    add_device_operation_instances(instances,
+                                   ck::util::filter_tuple_by_modulo_t<
+                                       device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                      NHWGC,
+                                                                                      GKYXC,
+                                                                                      Empty_Tuple,
+                                                                                      NHWGK,
+                                                                                      ConvFwd1x1P0,
+                                                                                      Intrawave>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           Empty_Tuple,
+                                                           NHWGK,
+                                                           ConvFwd1x1S1P0,
+                                                           Intrawave>,
+            Shards,
+            ShardIndex>{});
+
+    add_device_operation_instances(instances,
+                                   ck::util::filter_tuple_by_modulo_t<
+                                       device_grouped_conv_fwd_xdl_int8_mem_instances<2,
+                                                                                      NHWGC,
+                                                                                      GKYXC,
+                                                                                      Empty_Tuple,
+                                                                                      NHWGK,
+                                                                                      ConvFwdOddC,
+                                                                                      Intrawave>,
+                                       Shards,
+                                       ShardIndex>{});
+}
+
+} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/CMakeLists.txt
@@ -11,8 +11,6 @@ set(GROUPED_CONV3D_FWD
   xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_16x16_instance.cpp
   xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_16x16_instance.cpp
   xdl/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_int8_instance.cpp
-   xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp
-   xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp
   xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_instance.cpp
   xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_16x16_instance.cpp
   xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_16x16_instance.cpp
@@ -32,23 +30,13 @@ set(GROUPED_CONV3D_FWD
   xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_inter_instance.cpp
   xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_inter_instance.cpp
   xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_inter_instance.cpp
-   xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instance.cpp
-   xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instance.cpp
-   xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_inter_instance.cpp

   xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_mem_intra_instance.cpp
   xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_mem_intra_instance.cpp
   xdl/mem/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_mem_intra_instance.cpp
-   xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instance.cpp
-   xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_intra_instance.cpp
-   xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instance.cpp
   
-   xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.cpp
-   xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.cpp
-   xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instance.cpp
-   xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instance.cpp
-   xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instance.cpp
-   xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_comp_instance.cpp
+      xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f32_comp_instance.cpp
+xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_comp_instance.cpp
   xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_2x_instance.cpp
   xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_2x_instance.cpp
   xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_part2_instance.cpp
@@ -71,6 +59,99 @@ set(GROUPED_CONV3D_FWD
   wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_f16_oddc_instance.cpp
   wmma/device_grouped_conv3d_fwd_wmma_ndhwgc_gkzyxc_ndhwgk_i8_oddc_instance.cpp
 )
+# Add generated files for sharded instantiations.
+include(ShardInstantiation)
+
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instances
+  TEMPLATE_FILE xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.in
+  NUM_SHARDS 8
+  SRC_LIST GROUPED_CONV3D_FWD
+  OUTPUT_DIR ${GENERATED_DIR}/xdl
+)
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances
+  TEMPLATE_FILE xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.in
+  NUM_SHARDS 8
+  SRC_LIST GROUPED_CONV3D_FWD
+  OUTPUT_DIR ${GENERATED_DIR}/xdl
+)
+
+set(GENERATED_DIR ${CMAKE_CURRENT_BINARY_DIR}/generated)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instances
+  TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instance.in
+  NUM_SHARDS 10
+  SRC_LIST GROUPED_CONV3D_FWD
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
+)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instances
+  TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instance.in
+  NUM_SHARDS 10
+  SRC_LIST GROUPED_CONV3D_FWD
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
+)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_inter_instances
+  TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_inter_instance.in
+  NUM_SHARDS 10
+  SRC_LIST GROUPED_CONV3D_FWD
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
+)
+
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instances
+  TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instance.in
+  NUM_SHARDS 10
+  SRC_LIST GROUPED_CONV3D_FWD
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
+)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_intra_instances
+  TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_intra_instance.in
+  NUM_SHARDS 10
+  SRC_LIST GROUPED_CONV3D_FWD
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
+)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instances
+  TEMPLATE_FILE xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f32_mem_intra_instance.in
+  NUM_SHARDS 10
+  SRC_LIST GROUPED_CONV3D_FWD
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/mem
+)
+
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances
+  TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.in
+  NUM_SHARDS 12
+  SRC_LIST GROUPED_CONV3D_FWD
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
+)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances
+  TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.in
+  NUM_SHARDS 12
+  SRC_LIST GROUPED_CONV3D_FWD
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
+)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instances
+  TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instance.in
+  NUM_SHARDS 12
+  SRC_LIST GROUPED_CONV3D_FWD
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
+)
+generate_sharded_instantiations(
+  INSTANCES_NAME device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instances
+  TEMPLATE_FILE xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instance.in
+  NUM_SHARDS 12
+  SRC_LIST GROUPED_CONV3D_FWD
+  OUTPUT_DIR ${GENERATED_DIR}/xdl/comp
+)

 if((DTYPES MATCHES "fp8" AND DTYPES MATCHES "fp16") OR NOT DEFINED DTYPES)
    list(APPEND GROUPED_CONV3D_FWD
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.cpp
@@ -1,111 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/host_utility/device_prop.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                                NDHWGC,
-                                                                GKZYXC,
-                                                                Empty_Tuple,
-                                                                NDHWGK,
-                                                                BF16,
-                                                                BF16,
-                                                                Empty_Tuple,
-                                                                BF16,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
-                                                        NDHWGC,
-                                                        GKZYXC,
-                                                        Empty_Tuple,
-                                                        NDHWGK,
-                                                        ConvFwdDefault>{});
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
-                                                                                   NDHWGC,
-                                                                                   GKZYXC,
-                                                                                   Empty_Tuple,
-                                                                                   NDHWGK,
-                                                                                   ConvFwd1x1P0>{});
-    add_device_operation_instances(
-        instances,
-        device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
-                                                        NDHWGC,
-                                                        GKZYXC,
-                                                        Empty_Tuple,
-                                                        NDHWGK,
-                                                        ConvFwd1x1S1P0>{});
-
-    if(ck::get_device_name() != "gfx950")
-    {
-        add_device_operation_instances(
-            instances,
-            device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<3,
-                                                                  NDHWGC,
-                                                                  GKZYXC,
-                                                                  Empty_Tuple,
-                                                                  NDHWGK,
-                                                                  ConvFwdDefault>{});
-        add_device_operation_instances(
-            instances,
-            device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<3,
-                                                                  NDHWGC,
-                                                                  GKZYXC,
-                                                                  Empty_Tuple,
-                                                                  NDHWGK,
-                                                                  ConvFwd1x1P0>{});
-        add_device_operation_instances(
-            instances,
-            device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<3,
-                                                                  NDHWGC,
-                                                                  GKZYXC,
-                                                                  Empty_Tuple,
-                                                                  NDHWGK,
-                                                                  ConvFwd1x1S1P0>{});
-    }
-
-    if(ck::get_device_name() == "gfx950")
-    {
-        add_device_operation_instances(
-            instances,
-            device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<3,
-                                                               NDHWGC,
-                                                               GKZYXC,
-                                                               Empty_Tuple,
-                                                               NDHWGK,
-                                                               ConvFwdDefault>{});
-        add_device_operation_instances(
-            instances,
-            device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<3,
-                                                               NDHWGC,
-                                                               GKZYXC,
-                                                               Empty_Tuple,
-                                                               NDHWGK,
-                                                               ConvFwd1x1P0>{});
-        add_device_operation_instances(
-            instances,
-            device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<3,
-                                                               NDHWGC,
-                                                               GKZYXC,
-                                                               Empty_Tuple,
-                                                               NDHWGK,
-                                                               ConvFwd1x1S1P0>{});
-    }
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.in
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instance.in
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck::tensor_operation::device::instance {
+
+using device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances =
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>;
+
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances_shard(
+    device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_comp_instances& instances)
+{
+    add_device_operation_instances(
+        instances,
+        util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
+                                                            NDHWGC,
+                                                            GKZYXC,
+                                                            Empty_Tuple,
+                                                            NDHWGK,
+                                                            ConvFwdDefault>,
+            Shards,
+            ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
+                                                            NDHWGC,
+                                                            GKZYXC,
+                                                            Empty_Tuple,
+                                                            NDHWGK,
+                                                            ConvFwd1x1P0>,
+            Shards,
+            ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
+                                                            NDHWGC,
+                                                            GKZYXC,
+                                                            Empty_Tuple,
+                                                            NDHWGK,
+                                                            ConvFwd1x1S1P0>,
+            Shards,
+            ShardIndex>{});
+
+    if(ck::get_device_name() != "gfx950")
+    {
+    add_device_operation_instances(
+        instances,
+        util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<3,
+                                                            NDHWGC,
+                                                            GKZYXC,
+                                                            Empty_Tuple,
+                                                            NDHWGK,
+                                                            ConvFwdDefault>,
+            Shards,
+            ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<3,
+                                                            NDHWGC,
+                                                            GKZYXC,
+                                                            Empty_Tuple,
+                                                            NDHWGK,
+                                                            ConvFwd1x1P0>,
+            Shards,
+            ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_part2<3,
+                                                            NDHWGC,
+                                                            GKZYXC,
+                                                            Empty_Tuple,
+                                                            NDHWGK,
+                                                            ConvFwd1x1S1P0>,
+            Shards,
+            ShardIndex>{});
+    }
+
+    if(ck::get_device_name() == "gfx950")
+    {
+    add_device_operation_instances(
+        instances,
+        util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<3,
+                                                            NDHWGC,
+                                                            GKZYXC,
+                                                            Empty_Tuple,
+                                                            NDHWGK,
+                                                            ConvFwdDefault>,
+            Shards,
+            ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<3,
+                                                            NDHWGC,
+                                                            GKZYXC,
+                                                            Empty_Tuple,
+                                                            NDHWGK,
+                                                            ConvFwd1x1P0>,
+            Shards,
+            ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_bf16_comp_instances_2x<3,
+                                                            NDHWGC,
+                                                            GKZYXC,
+                                                            Empty_Tuple,
+                                                            NDHWGK,
+                                                            ConvFwd1x1S1P0>,
+            Shards,
+            ShardIndex>{});    }
+}
+
+} // namespace ck::tensor_operation::device::instance
+
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.cpp
@@ -1,111 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/host_utility/device_prop.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                                NDHWGC,
-                                                                GKZYXC,
-                                                                Empty_Tuple,
-                                                                NDHWGK,
-                                                                F16,
-                                                                F16,
-                                                                Empty_Tuple,
-                                                                F16,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_grouped_conv_fwd_xdl_f16_comp_instances<3,
-                                                       NDHWGC,
-                                                       GKZYXC,
-                                                       Empty_Tuple,
-                                                       NDHWGK,
-                                                       ConvFwdDefault>{});
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_f16_comp_instances<3,
-                                                                                  NDHWGC,
-                                                                                  GKZYXC,
-                                                                                  Empty_Tuple,
-                                                                                  NDHWGK,
-                                                                                  ConvFwd1x1P0>{});
-    add_device_operation_instances(
-        instances,
-        device_grouped_conv_fwd_xdl_f16_comp_instances<3,
-                                                       NDHWGC,
-                                                       GKZYXC,
-                                                       Empty_Tuple,
-                                                       NDHWGK,
-                                                       ConvFwd1x1S1P0>{});
-
-    if(ck::get_device_name() != "gfx950")
-    {
-        add_device_operation_instances(
-            instances,
-            device_grouped_conv_fwd_xdl_f16_comp_instances_part2<3,
-                                                                 NDHWGC,
-                                                                 GKZYXC,
-                                                                 Empty_Tuple,
-                                                                 NDHWGK,
-                                                                 ConvFwdDefault>{});
-        add_device_operation_instances(
-            instances,
-            device_grouped_conv_fwd_xdl_f16_comp_instances_part2<3,
-                                                                 NDHWGC,
-                                                                 GKZYXC,
-                                                                 Empty_Tuple,
-                                                                 NDHWGK,
-                                                                 ConvFwd1x1P0>{});
-        add_device_operation_instances(
-            instances,
-            device_grouped_conv_fwd_xdl_f16_comp_instances_part2<3,
-                                                                 NDHWGC,
-                                                                 GKZYXC,
-                                                                 Empty_Tuple,
-                                                                 NDHWGK,
-                                                                 ConvFwd1x1S1P0>{});
-    }
-
-    if(ck::get_device_name() == "gfx950")
-    {
-        add_device_operation_instances(
-            instances,
-            device_grouped_conv_fwd_xdl_f16_comp_instances_2x<3,
-                                                              NDHWGC,
-                                                              GKZYXC,
-                                                              Empty_Tuple,
-                                                              NDHWGK,
-                                                              ConvFwdDefault>{});
-        add_device_operation_instances(
-            instances,
-            device_grouped_conv_fwd_xdl_f16_comp_instances_2x<3,
-                                                              NDHWGC,
-                                                              GKZYXC,
-                                                              Empty_Tuple,
-                                                              NDHWGK,
-                                                              ConvFwd1x1P0>{});
-        add_device_operation_instances(
-            instances,
-            device_grouped_conv_fwd_xdl_f16_comp_instances_2x<3,
-                                                              NDHWGC,
-                                                              GKZYXC,
-                                                              Empty_Tuple,
-                                                              NDHWGK,
-                                                              ConvFwd1x1S1P0>{});
-    }
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.in
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instance.in
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck::tensor_operation::device::instance {
+
+using device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances =
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>;
+
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances_shard(
+    device_grouped_conv3d_fwd_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_instances& instances)
+{
+    add_device_operation_instances(
+        instances,
+        util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_f16_comp_instances<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           Empty_Tuple,
+                                                           NDHWGK,
+                                                           ConvFwdDefault>,
+            Shards,
+            ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f16_comp_instances<3,
+                                                                                      NDHWGC,
+                                                                                      GKZYXC,
+                                                                                      Empty_Tuple,
+                                                                                      NDHWGK,
+                                                                                      ConvFwd1x1P0>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_f16_comp_instances<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           Empty_Tuple,
+                                                           NDHWGK,
+                                                           ConvFwd1x1S1P0>,
+            Shards,
+            ShardIndex>{});
+    
+        if(ck::get_device_name() != "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            util::filter_tuple_by_modulo_t<
+                device_grouped_conv_fwd_xdl_f16_comp_instances_part2<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                ConvFwdDefault>,
+                Shards,
+                ShardIndex>{});
+
+        add_device_operation_instances(
+            instances,
+            util::filter_tuple_by_modulo_t<
+                device_grouped_conv_fwd_xdl_f16_comp_instances_part2<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                ConvFwd1x1P0>,
+                Shards,
+                ShardIndex>{});
+
+        add_device_operation_instances(
+            instances,
+            util::filter_tuple_by_modulo_t<
+                device_grouped_conv_fwd_xdl_f16_comp_instances_part2<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                ConvFwd1x1S1P0>,
+                Shards,
+                ShardIndex>{});
+    }
+
+    if(ck::get_device_name() == "gfx950")
+    {
+        add_device_operation_instances(
+            instances,
+            util::filter_tuple_by_modulo_t<
+                device_grouped_conv_fwd_xdl_f16_comp_instances_2x<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                ConvFwdDefault>,
+                Shards,
+                ShardIndex>{});
+
+        add_device_operation_instances(
+            instances,
+            util::filter_tuple_by_modulo_t<
+                device_grouped_conv_fwd_xdl_f16_comp_instances_2x<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                ConvFwd1x1P0>,
+                Shards,
+                ShardIndex>{});
+
+        add_device_operation_instances(
+            instances,
+            util::filter_tuple_by_modulo_t<
+                device_grouped_conv_fwd_xdl_f16_comp_instances_2x<3,
+                                                                NDHWGC,
+                                                                GKZYXC,
+                                                                Empty_Tuple,
+                                                                NDHWGK,
+                                                                ConvFwd1x1S1P0>,
+                Shards,
+                ShardIndex>{}); 
+    }
+}
+
+} // namespace ck::tensor_operation::device::instance
+
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instance.cpp
@@ -1,54 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                                NGCDHW,
-                                                                GKCZYX,
-                                                                Empty_Tuple,
-                                                                NGKDHW,
-                                                                BF16,
-                                                                BF16,
-                                                                Empty_Tuple,
-                                                                BF16,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
-                                                        NGCDHW,
-                                                        GKCZYX,
-                                                        Empty_Tuple,
-                                                        NGKDHW,
-                                                        ConvFwdDefault>{});
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
-                                                                                   NGCDHW,
-                                                                                   GKCZYX,
-                                                                                   Empty_Tuple,
-                                                                                   NGKDHW,
-                                                                                   ConvFwd1x1P0>{});
-    add_device_operation_instances(
-        instances,
-        device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
-                                                        NGCDHW,
-                                                        GKCZYX,
-                                                        Empty_Tuple,
-                                                        NGKDHW,
-                                                        ConvFwd1x1S1P0>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instance.in
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instance.in
@@ -0,0 +1,65 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck::tensor_operation::device::instance {
+
+using device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instances =
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>;
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instances_shard(
+    device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_comp_instances& instances)
+{
+    add_device_operation_instances(
+        instances,
+        util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
+                                                            NGCDHW,
+                                                            GKCZYX,
+                                                            Empty_Tuple,
+                                                            NGKDHW,
+                                                            ConvFwdDefault>,
+            Shards,
+            ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
+                                                            NGCDHW,
+                                                            GKCZYX,
+                                                            Empty_Tuple,
+                                                            NGKDHW,
+                                                            ConvFwd1x1P0>,
+            Shards,
+            ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_bf16_comp_instances<3,
+                                                            NGCDHW,
+                                                            GKCZYX,
+                                                            Empty_Tuple,
+                                                            NGKDHW,
+                                                            ConvFwd1x1S1P0>,
+            Shards,
+            ShardIndex>{});
+}
+
+} // namespace ck::tensor_operation::device::instance
+
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instance.cpp
@@ -1,54 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                                NGCDHW,
-                                                                GKCZYX,
-                                                                Empty_Tuple,
-                                                                NGKDHW,
-                                                                F16,
-                                                                F16,
-                                                                Empty_Tuple,
-                                                                F16,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances)
-{
-    add_device_operation_instances(
-        instances,
-        device_grouped_conv_fwd_xdl_f16_comp_instances<3,
-                                                       NGCDHW,
-                                                       GKCZYX,
-                                                       Empty_Tuple,
-                                                       NGKDHW,
-                                                       ConvFwdDefault>{});
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_f16_comp_instances<3,
-                                                                                  NGCDHW,
-                                                                                  GKCZYX,
-                                                                                  Empty_Tuple,
-                                                                                  NGKDHW,
-                                                                                  ConvFwd1x1P0>{});
-    add_device_operation_instances(
-        instances,
-        device_grouped_conv_fwd_xdl_f16_comp_instances<3,
-                                                       NGCDHW,
-                                                       GKCZYX,
-                                                       Empty_Tuple,
-                                                       NGKDHW,
-                                                       ConvFwd1x1S1P0>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instance.in
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/comp/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instance.in
@@ -0,0 +1,63 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck::tensor_operation::device::instance {
+
+using device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instances =
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                F16,
+                                                                F16,
+                                                                Empty_Tuple,
+                                                                F16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>;
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instances_shard(
+    device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_comp_instances& instances)
+{
+    add_device_operation_instances(
+        instances,
+        util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_f16_comp_instances<3,
+                                                           NGCDHW,
+                                                           GKCZYX,
+                                                           Empty_Tuple,
+                                                           NGKDHW,
+                                                           ConvFwdDefault>,
+            Shards,
+            ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f16_comp_instances<3,
+                                                                                      NGCDHW,
+                                                                                      GKCZYX,
+                                                                                      Empty_Tuple,
+                                                                                      NGKDHW,
+                                                                                      ConvFwd1x1P0>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_f16_comp_instances<3,
+                                                           NGCDHW,
+                                                           GKCZYX,
+                                                           Empty_Tuple,
+                                                           NGKDHW,
+                                                           ConvFwd1x1S1P0>,
+            Shards,
+            ShardIndex>{});
+}
+
+} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instance.cpp
@@ -1,53 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                                NGCDHW,
-                                                                GKCZYX,
-                                                                Empty_Tuple,
-                                                                NGKDHW,
-                                                                BF16,
-                                                                BF16,
-                                                                Empty_Tuple,
-                                                                BF16,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances)
-{
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_bf16_instances<3,
-                                                                              NGCDHW,
-                                                                              GKCZYX,
-                                                                              Empty_Tuple,
-                                                                              NGKDHW,
-                                                                              ConvFwdDefault>{});
-
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_bf16_instances<3,
-                                                                              NGCDHW,
-                                                                              GKCZYX,
-                                                                              Empty_Tuple,
-                                                                              NGKDHW,
-                                                                              ConvFwd1x1P0>{});
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_bf16_instances<3,
-                                                                              NGCDHW,
-                                                                              GKCZYX,
-                                                                              Empty_Tuple,
-                                                                              NGKDHW,
-                                                                              ConvFwd1x1S1P0>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instance.cpp
@@ -1,15 +1,14 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.

-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"

-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
+namespace ck::tensor_operation::device::instance {

-void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_instances(
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+using device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instances =
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                NGCDHW,
                                                                GKCZYX,
@@ -21,35 +20,43 @@ void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_intra_insta
                                                                BF16,
                                                                PassThrough,
                                                                PassThrough,
-                                                                PassThrough>>>& instances)
+                                                                PassThrough>>>;
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instances_shard(
+    device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_instances& instances)
 {
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
+    add_device_operation_instances(
+        instances,
+        util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_bf16_instances<3,
                                                                                  NGCDHW,
                                                                                  GKCZYX,
                                                                                  Empty_Tuple,
                                                                                  NGKDHW,
-                                                                                  ConvFwdDefault,
-                                                                                  Intrawave>{});
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
+                                                                                  ConvFwdDefault>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_bf16_instances<3,
                                                                                  NGCDHW,
                                                                                  GKCZYX,
                                                                                  Empty_Tuple,
                                                                                  NGKDHW,
-                                                                                  ConvFwd1x1P0,
-                                                                                  Intrawave>{});
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
+                                                                                  ConvFwd1x1P0>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_bf16_instances<3,
                                                                                  NGCDHW,
                                                                                  GKCZYX,
                                                                                  Empty_Tuple,
                                                                                  NGKDHW,
-                                                                                  ConvFwd1x1S1P0,
-                                                                                  Intrawave>{});
+                                                                                  ConvFwd1x1S1P0>,
+                                       Shards,
+                                       ShardIndex>{});
 }

-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
+} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_16x16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_16x16_instance.cpp
@@ -47,6 +47,15 @@ void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_16x16_instances(
                                                        Empty_Tuple,
                                                        NGKDHW,
                                                        ConvFwd1x1S1P0>{});
+
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_f16_nchw_instances<3,
+                                                       NGCDHW,
+                                                       GKCZYX,
+                                                       Empty_Tuple,
+                                                       NGKDHW,
+                                                       ConvFwd1x1S1P0>{});
 }

 } // namespace instance
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instance.cpp
@@ -1,53 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
-void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances(
-    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
-                                                                NGCDHW,
-                                                                GKCZYX,
-                                                                Empty_Tuple,
-                                                                NGKDHW,
-                                                                F16,
-                                                                F16,
-                                                                Empty_Tuple,
-                                                                F16,
-                                                                PassThrough,
-                                                                PassThrough,
-                                                                PassThrough>>>& instances)
-{
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_f16_instances<3,
-                                                                             NGCDHW,
-                                                                             GKCZYX,
-                                                                             Empty_Tuple,
-                                                                             NGKDHW,
-                                                                             ConvFwdDefault>{});
-
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_f16_instances<3,
-                                                                             NGCDHW,
-                                                                             GKCZYX,
-                                                                             Empty_Tuple,
-                                                                             NGKDHW,
-                                                                             ConvFwd1x1P0>{});
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_f16_instances<3,
-                                                                             NGCDHW,
-                                                                             GKCZYX,
-                                                                             Empty_Tuple,
-                                                                             NGKDHW,
-                                                                             ConvFwd1x1S1P0>{});
-}
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instance.cpp
@@ -1,15 +1,14 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.

-#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"

-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
+namespace ck::tensor_operation::device::instance {

-void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instances(
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+using device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances =
    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
                                                                NGCDHW,
                                                                GKCZYX,
@@ -21,35 +20,43 @@ void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_mem_inter_instan
                                                                F16,
                                                                PassThrough,
                                                                PassThrough,
-                                                                PassThrough>>>& instances)
+                                                                PassThrough>>>;
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances_shard(
+    device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_f16_instances& instances)
 {
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_f16_mem_instances<3,
+    add_device_operation_instances(
+        instances,
+        util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f16_instances<3,
                                                                                 NGCDHW,
                                                                                 GKCZYX,
                                                                                 Empty_Tuple,
                                                                                 NGKDHW,
-                                                                                 ConvFwdDefault,
-                                                                                 Interwave>{});
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_f16_mem_instances<3,
+                                                                                 ConvFwdDefault>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f16_instances<3,
                                                                                 NGCDHW,
                                                                                 GKCZYX,
                                                                                 Empty_Tuple,
                                                                                 NGKDHW,
-                                                                                 ConvFwd1x1P0,
-                                                                                 Interwave>{});
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_fwd_xdl_f16_mem_instances<3,
+                                                                                 ConvFwd1x1P0>,
+                                       Shards,
+                                       ShardIndex>{});
+
+    add_device_operation_instances(
+        instances,
+        util::filter_tuple_by_modulo_t<device_grouped_conv_fwd_xdl_f16_instances<3,
                                                                                 NGCDHW,
                                                                                 GKCZYX,
                                                                                 Empty_Tuple,
                                                                                 NGKDHW,
-                                                                                 ConvFwd1x1S1P0,
-                                                                                 Interwave>{});
+                                                                                 ConvFwd1x1S1P0>,
+                                       Shards,
+                                       ShardIndex>{});
 }

-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-} // namespace ck
+} // namespace ck::tensor_operation::device::instance
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instance.in
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd/xdl/mem/device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instance.in
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_mem_instance.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/utility/filter_tuple.hpp"
+
+namespace ck::tensor_operation::device::instance {
+
+using device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instances =
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
+                                                                NGCDHW,
+                                                                GKCZYX,
+                                                                Empty_Tuple,
+                                                                NGKDHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>;
+template <int Shards, int ShardIndex>
+void add_device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instances_shard(
+    device_grouped_conv3d_fwd_xdl_ngcdhw_gkczyx_ngkdhw_bf16_mem_inter_instances& instances)
+{
+    add_device_operation_instances(
+        instances,
+        ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
+                                                           NGCDHW,
+                                                           GKCZYX,
+                                                           Empty_Tuple,
+                                                           NGKDHW,
+                                                           ConvFwdDefault,
+                                                           Interwave>,
+            Shards,
+            ShardIndex>{});
+    add_device_operation_instances(instances,
+                                   ck::util::filter_tuple_by_modulo_t<
+                                       device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
+                                                                                      NGCDHW,
+                                                                                      GKCZYX,
+                                                                                      Empty_Tuple,
+                                                                                      NGKDHW,
+                                                                                      ConvFwd1x1P0,
+                                                                                      Interwave>,
+                                       Shards,
+                                       ShardIndex>{});
+    add_device_operation_instances(
+        instances,
+        ck::util::filter_tuple_by_modulo_t<
+            device_grouped_conv_fwd_xdl_bf16_mem_instances<3,
+                                                           NGCDHW,
+                                                           GKCZYX,
+                                                           Empty_Tuple,
+                                                           NGKDHW,
+                                                           ConvFwd1x1S1P0,
+                                                           Interwave>,
+            Shards,
+            ShardIndex>{});
+}
+
+} // namespace ck::tensor_operation::device::instance
--- a/Show More
+++ b/Show More