Merge branch 'develop' into lwpck-2836

2026-06-06 15:54:31 +00:00 · 2025-02-21 14:59:53 -06:00
parent 3fe8d318bf ffa13455a2
commit ddaa893d7b
260 changed files with 16502 additions and 1713 deletions
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -4,8 +4,9 @@
 #pragma once

 #include "ck/config.h"
+
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 #include "ck/utility/env.hpp"
-#ifndef CK_CODE_GEN_RTC
 #ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS
 #include "hip/hip_runtime.h"
 #include "hip/hip_fp16.h"
--- a/include/ck/host_utility/device_prop.hpp
+++ b/include/ck/host_utility/device_prop.hpp
@@ -3,6 +3,7 @@

 #pragma once

+#ifndef __HIPCC_RTC__
 #include <string>
 #include <map>
 #include <hip/hip_runtime.h>
@@ -97,3 +98,4 @@ inline bool is_gfx12_supported()
 }

 } // namespace ck
+#endif
--- a/include/ck/host_utility/hip_check_error.hpp
+++ b/include/ck/host_utility/hip_check_error.hpp
@@ -19,16 +19,15 @@ inline void hip_check_error(hipError_t x)
    }
 }

-#define HIP_CHECK_ERROR(retval_or_funcall)                                 \
-    do                                                                     \
-    {                                                                      \
-        hipError_t _tmpVal = retval_or_funcall;                            \
-        if(_tmpVal != hipSuccess)                                          \
-        {                                                                  \
-            std::ostringstream ostr;                                       \
-            ostr << "HIP Function Failed ("                                \
-                 << "hip_check_error.hpp"                                  \
-                 << "," << __LINE__ << ") " << hipGetErrorString(_tmpVal); \
-            throw std::runtime_error(ostr.str());                          \
-        }                                                                  \
+#define HIP_CHECK_ERROR(retval_or_funcall)                                         \
+    do                                                                             \
+    {                                                                              \
+        hipError_t _tmpVal = retval_or_funcall;                                    \
+        if(_tmpVal != hipSuccess)                                                  \
+        {                                                                          \
+            std::ostringstream ostr;                                               \
+            ostr << "HIP Function Failed (" << __FILE__ << "," << __LINE__ << ") " \
+                 << hipGetErrorString(_tmpVal);                                    \
+            throw std::runtime_error(ostr.str());                                  \
+        }                                                                          \
    } while(0)
--- a/include/ck/host_utility/kernel_launch.hpp
+++ b/include/ck/host_utility/kernel_launch.hpp
@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
-
+#ifndef __HIPCC_RTC__
 #include <hip/hip_runtime.h>

 #include "ck/ck.hpp"
@@ -166,3 +166,4 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
    return 0;
 #endif
 }
+#endif
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_ab_scale_selector.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_ab_scale_selector.hpp
@@ -9,13 +9,6 @@

 namespace ck {

-enum struct BlockGemmPipelineVersion
-{
-    v1, // Naive
-    v2, // Mem
-    v3, // Comp
-};
-
 template <BlockGemmPipelineVersion BlkGemmPipelineVer,
          BlockGemmPipelineScheduler BlkGemmPipeSche,
          index_t BlockSize,
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp
@@ -0,0 +1,110 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v2.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp"
+namespace ck {
+
+template <BlockGemmPipelineVersion BlkGemmPipelineVer,
+          BlockGemmPipelineScheduler BlkGemmPipeSche,
+          index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack>
+constexpr auto BlockGemmBPreshufflePipeline_Selector()
+{
+    if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
+    {
+        return BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlkGemmPipeSche,
+                                                           BlockSize,
+                                                           ADataType,
+                                                           BDataType,
+                                                           ComputeDataType,
+                                                           AccDataType,
+                                                           ATileDesc,
+                                                           BTileDesc,
+                                                           AMmaTileDesc,
+                                                           BMmaTileDesc,
+                                                           ABlockTransferSrcScalarPerVector,
+                                                           BBlockTransferSrcScalarPerVector,
+                                                           MPerBlock,
+                                                           NPerBlock,
+                                                           KPerBlock,
+                                                           MPerXDL,
+                                                           NPerXDL,
+                                                           MRepeat,
+                                                           NRepeat,
+                                                           KPack>{};
+    }
+    else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2)
+    {
+        return BlockwiseGemmXdlops_pipeline_bpreshuffle_v2<BlkGemmPipeSche,
+                                                           BlockSize,
+                                                           ADataType,
+                                                           BDataType,
+                                                           ComputeDataType,
+                                                           AccDataType,
+                                                           ATileDesc,
+                                                           BTileDesc,
+                                                           AMmaTileDesc,
+                                                           BMmaTileDesc,
+                                                           ABlockTransferSrcScalarPerVector,
+                                                           BBlockTransferSrcScalarPerVector,
+                                                           MPerBlock,
+                                                           NPerBlock,
+                                                           KPerBlock,
+                                                           MPerXDL,
+                                                           NPerXDL,
+                                                           MRepeat,
+                                                           NRepeat,
+                                                           KPack>{};
+    }
+    else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
+    {
+        static_assert(MRepeat >= 4, "MRepeat should at least be 4 in BlockGemmPipelineVersion::v3");
+        return BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlkGemmPipeSche,
+                                                           BlockSize,
+                                                           ADataType,
+                                                           BDataType,
+                                                           ComputeDataType,
+                                                           AccDataType,
+                                                           ATileDesc,
+                                                           BTileDesc,
+                                                           AMmaTileDesc,
+                                                           BMmaTileDesc,
+                                                           ABlockTransferSrcScalarPerVector,
+                                                           BBlockTransferSrcScalarPerVector,
+                                                           MPerBlock,
+                                                           NPerBlock,
+                                                           KPerBlock,
+                                                           MPerXDL,
+                                                           NPerXDL,
+                                                           MRepeat,
+                                                           NRepeat,
+                                                           KPack>{};
+    }
+    else
+    {
+        std::cerr << "BlockGemmPipeline configuration is not available" << std::endl;
+    }
+}
+
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp
@@ -0,0 +1,506 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp"
+
+namespace ck {
+
+// Compute optimized pipeline
+// GlobalPrefetchStages: 2
+// LocalPreFillStages: 1
+// LocalPreFetchStages: 1
+// LocalSharedMemoryBuffer: 1
+
+template <BlockGemmPipelineScheduler BlkGemmPipelineVer,
+          index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPacks>
+struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1
+{
+};
+
+template <index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack
+          // ,bool TransposeC //disable transposec right now...
+          >
+struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::Intrawave,
+                                                   BlockSize,
+                                                   ADataType,
+                                                   BDataType,
+                                                   ComputeDataType,
+                                                   AccDataType,
+                                                   ATileDesc,
+                                                   BTileDesc,
+                                                   AMmaTileDesc,
+                                                   BMmaTileDesc,
+                                                   ABlockTransferSrcScalarPerVector,
+                                                   BBlockTransferSrcScalarPerVector,
+                                                   MPerBlock,
+                                                   NPerBlock,
+                                                   KPerBlock,
+                                                   MPerXDL,
+                                                   NPerXDL,
+                                                   MRepeat,
+                                                   NRepeat,
+                                                   KPack>
+    : BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                        ADataType,
+                                        BDataType,
+                                        ComputeDataType,
+                                        AccDataType,
+                                        ATileDesc,
+                                        BTileDesc,
+                                        AMmaTileDesc,
+                                        BMmaTileDesc,
+                                        ABlockTransferSrcScalarPerVector,
+                                        BBlockTransferSrcScalarPerVector,
+                                        MPerBlock,
+                                        NPerBlock,
+                                        KPerBlock,
+                                        MPerXDL,
+                                        NPerXDL,
+                                        MRepeat,
+                                        NRepeat,
+                                        KPack>
+
+{
+    using Base = BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                                   ADataType,
+                                                   BDataType,
+                                                   ComputeDataType,
+                                                   AccDataType,
+                                                   ATileDesc,
+                                                   BTileDesc,
+                                                   AMmaTileDesc,
+                                                   BMmaTileDesc,
+                                                   ABlockTransferSrcScalarPerVector,
+                                                   BBlockTransferSrcScalarPerVector,
+                                                   MPerBlock,
+                                                   NPerBlock,
+                                                   KPerBlock,
+                                                   MPerXDL,
+                                                   NPerXDL,
+                                                   MRepeat,
+                                                   NRepeat,
+                                                   KPack>;
+    using Base::A_K1;
+    using Base::B_K1;
+    using Base::I0;
+    using Base::I1;
+    using Base::KRepeat;
+    using Base::xdlops_gemm;
+    using typename Base::HotLoopInstList;
+
+    using Base::a_block_desc_m0_m1_m2_k;
+    using Base::CalculateCThreadOriginDataIndex;
+    using Base::CalculateCThreadOriginDataIndex8D;
+    using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::GetCThreadBuffer;
+    using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+
+    using Base::AMmaKStride;
+    using Base::BMmaKStride;
+
+    static constexpr index_t PrefetchStages  = 2;
+    static constexpr index_t PrefillStages   = 1;
+    static constexpr index_t GlobalBufferNum = 2;
+
+    template <typename TileDesc_M0_M1_M2_K>
+    __host__ __device__ static constexpr auto MakeAGemmMmaTileDescriptor(const TileDesc_M0_M1_M2_K&)
+    {
+        constexpr index_t M0 = TileDesc_M0_M1_M2_K{}.GetLength(Number<0>{});
+        constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
+        constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
+        constexpr index_t K2 = KPack;
+        constexpr index_t K1 = 64 / NPerXDL;
+        constexpr index_t K0 = KRepeat;
+
+        return transform_tensor_descriptor(
+            TileDesc_M0_M1_M2_K{},
+            make_tuple(
+                make_pass_through_transform(Number<M0>{}),
+                make_pass_through_transform(Number<M1>{}),
+                make_pass_through_transform(Number<M2>{}),
+                make_unmerge_transform(make_tuple(Number<K0>{}, Number<K1>{}, Number<K2>{}))),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3, 4, 5>{}));
+    }
+
+    static constexpr auto a_block_desc_m0_m1_m2_k0_k1_k2 =
+        MakeAGemmMmaTileDescriptor(a_block_desc_m0_m1_m2_k);
+
+    __host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop)
+    {
+        return num_loop > PrefetchStages;
+    }
+
+    __host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
+    {
+        return num_loop % 2 == 0 ? TailNumber::Even : TailNumber::Odd;
+    }
+
+    __device__ static constexpr auto HotLoopScheduler()
+    {
+        constexpr auto num_ds_read_inst_a     = HotLoopInstList::A_LDS_Read_Inst_Num;
+        constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num;
+        constexpr auto num_buffer_load_inst_b = HotLoopInstList::B_Buffer_Load_Inst_Num;
+
+        // B global
+        static_for<0, num_buffer_load_inst_b, 1>{}([&](auto i) {
+            ignore = i;
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+        });
+
+        // A global
+        static_for<0, num_buffer_load_inst_a, 1>{}([&](auto i) {
+            ignore = i;
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+        });
+
+        // A local
+        static_for<0, num_ds_read_inst_a / 2, 1>{}([&](auto i) {
+            ignore = i;
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x100, 2, 0); // DS read
+        });
+    }
+
+    template <bool HasMainLoop,
+              TailNumber TailNum,
+              typename AGridDesc,
+              typename ABlockDesc,
+              typename ABlockTransfer,
+              typename AGridBuffer,
+              typename ABlockBuffer,
+              typename ABlockTransferStep,
+              typename BGridDesc,
+              typename BBlockTransfer,
+              typename BGridBuffer,
+              typename BBlockBuffer,
+              typename BBlockTransferStep,
+              typename CThreadBuffer>
+    __device__ void Run(const AGridDesc& a_grid_desc,
+                        const ABlockDesc& a_block_desc,
+                        ABlockTransfer& a_blockwise_copy,
+                        const AGridBuffer& a_grid_buf,
+                        ABlockBuffer& a_block_buf,
+                        const ABlockTransferStep& a_block_copy_step,
+                        const BGridDesc& b_grid_desc,
+                        BBlockTransfer& b_blockwise_copy,
+                        const BGridBuffer& b_grid_buf,
+                        BBlockBuffer& b_block_buf,
+                        const BBlockTransferStep& b_block_copy_step,
+                        CThreadBuffer& c_thread_buf,
+                        index_t num_loop) const
+    {
+        ignore = b_block_buf;
+        __builtin_amdgcn_sched_barrier(0);
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            a_thread_desc_.GetElementSpaceSize());
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            b_thread_desc_.GetElementSpaceSize());
+
+        StaticallyIndexedArray<decltype(b_thread_buf), Number<2>{}> b_thread_bufs;
+        constexpr auto b_block_origin_idx = make_tuple(I0, I0, I0, I0);
+
+        // Global prefetch A1 B1
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0);
+        b_blockwise_copy.Run(b_grid_desc,
+                             b_grid_buf,
+                             b_block_desc_n0_n1_k0_k1,
+                             b_block_origin_idx,
+                             b_thread_bufs(I0));
+
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+        __builtin_amdgcn_sched_barrier(0);
+
+        // // Local prefill A1
+        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I0);
+
+        // // Global prefetch A2
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0);
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+
+        // Local prefetch A1
+        block_sync_lds();
+        static_for<0, MRepeat, 1>{}([&](auto m0) {
+            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                   make_tuple(m0, I0, I0, k0, I0, I0),
+                                   a_block_buf,
+                                   a_thread_desc_,
+                                   make_tuple(m0, I0, I0, k0, I0, I0),
+                                   a_thread_buf);
+            });
+        });
+
+        // Initialize C
+        c_thread_buf.Clear();
+
+        __builtin_amdgcn_sched_barrier(0);
+
+        // main body
+        if constexpr(HasMainLoop)
+        {
+            index_t i = 0;
+            do
+            {
+                auto LoopFunc = [&](auto mfma_reg_buf, auto local_read_buf) {
+                    b_blockwise_copy.Run(b_grid_desc,
+                                         b_grid_buf,
+                                         b_block_desc_n0_n1_k0_k1,
+                                         b_block_origin_idx,
+                                         b_thread_bufs(local_read_buf));
+                    b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+                    block_sync_lds();
+                    a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, mfma_reg_buf);
+
+                    a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, local_read_buf);
+                    a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        static_for<0, NRepeat, 1>{}([&](auto n0) {
+                            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                                vector_type<ComputeDataType, KPack> a_thread_vec;
+                                vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                                static_for<0, KPack, 1>{}([&](auto ik) {
+                                    a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                            make_tuple(m0, I0, I0, k0, I0, ik))>{}];
+                                    b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        b_thread_bufs[mfma_reg_buf]
+                                                     [Number<b_thread_desc_.CalculateOffset(
+                                                         make_tuple(n0, I0, k0, ik))>{}];
+                                });
+
+                                using mfma_input_type =
+                                    typename vector_type<ComputeDataType,
+                                                         xdlops_gemm.K1PerXdlops>::type;
+
+                                constexpr index_t c_offset =
+                                    c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                                xdlops_gemm.Run(
+                                    a_thread_vec.template AsType<mfma_input_type>(),
+                                    b_thread_vec.template AsType<mfma_input_type>(),
+                                    c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                            });
+                        });
+                    });
+
+                    block_sync_lds();
+
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        static_for<0, KRepeat, 1>{}([&](auto k0) {
+                            a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                               make_tuple(m0, I0, I0, k0, I0, I0),
+                                               a_block_buf,
+                                               a_thread_desc_,
+                                               make_tuple(m0, I0, I0, k0, I0, I0),
+                                               a_thread_buf);
+                        });
+                    });
+
+                    HotLoopScheduler();
+                    __builtin_amdgcn_sched_barrier(0);
+                };
+
+                LoopFunc(I0, I1);
+                LoopFunc(I1, I0);
+
+                i += 2;
+            } while(i < (num_loop - 2));
+        }
+        // tail
+        if constexpr(TailNum == TailNumber::Even)
+        {
+            b_blockwise_copy.Run(b_grid_desc,
+                                 b_grid_buf,
+                                 b_block_desc_n0_n1_k0_k1,
+                                 b_block_origin_idx,
+                                 b_thread_bufs(I1));
+
+            block_sync_lds();
+            a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0, I0, I0, k0, I0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs[I0][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+            });
+
+            block_sync_lds();
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, KRepeat, 1>{}([&](auto k0) {
+                    a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                       make_tuple(m0, I0, I0, k0, I0, I0),
+                                       a_block_buf,
+                                       a_thread_desc_,
+                                       make_tuple(m0, I0, I0, k0, I0, I0),
+                                       a_thread_buf);
+                });
+            });
+
+            __builtin_amdgcn_sched_barrier(0);
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0, I0, I0, k0, I0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs[I1][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+            });
+            // Let's leak last MFMA block to epilogue region, cover the potential lds-shuffle
+            // latency
+            // __builtin_amdgcn_sched_barrier(0);
+        }
+        else
+        {
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0, I0, I0, k0, I0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs[I0][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+            });
+        }
+    }
+
+    protected:
+    // MRepeat MWave MLane KRepeat KLane KPack
+    // KRepeat -> MRepeat-> Mwave->KLane->MLane->KPack
+    static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<MRepeat>{}, I1, I1, Number<KRepeat>{}, I1, Number<KPack>{}));
+
+    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<ADataType,
+                                                         ComputeDataType,
+                                                         decltype(a_block_desc_m0_m1_m2_k0_k1_k2),
+                                                         decltype(a_thread_desc_),
+                                                         Sequence<1, 1, 1, 1, 1, KPack>,
+                                                         Sequence<0, 1, 2, 3, 4, 5>,
+                                                         5,
+                                                         A_K1,
+                                                         A_K1>;
+
+    AThreadCopy a_thread_copy_{Base::CalculateAThreadOriginDataIndex6D()};
+
+    static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<NRepeat>{}, I1, Number<KRepeat>{}, Number<KPack>{}));
+
+    static constexpr BTileDesc b_block_desc_n0_n1_k0_k1;
+
+    using Base::c_thread_desc_;
+};
+
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v2.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v2.hpp
@@ -0,0 +1,558 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp"
+
+namespace ck {
+
+// Compute optimized pipeline
+// GlobalPrefetchStages: 3
+// LocalPreFillStages: 2
+// LocalPreFetchStages: 2
+// LocalSharedMemoryBuffer: 2
+
+template <BlockGemmPipelineScheduler BlkGemmPipelineVer,
+          index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPacks>
+struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v2
+{
+};
+
+template <index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack
+          // ,bool TransposeC //disable transposec right now...
+          >
+struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v2<BlockGemmPipelineScheduler::Intrawave,
+                                                   BlockSize,
+                                                   ADataType,
+                                                   BDataType,
+                                                   ComputeDataType,
+                                                   AccDataType,
+                                                   ATileDesc,
+                                                   BTileDesc,
+                                                   AMmaTileDesc,
+                                                   BMmaTileDesc,
+                                                   ABlockTransferSrcScalarPerVector,
+                                                   BBlockTransferSrcScalarPerVector,
+                                                   MPerBlock,
+                                                   NPerBlock,
+                                                   KPerBlock,
+                                                   MPerXDL,
+                                                   NPerXDL,
+                                                   MRepeat,
+                                                   NRepeat,
+                                                   KPack>
+    : BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                        ADataType,
+                                        BDataType,
+                                        ComputeDataType,
+                                        AccDataType,
+                                        ATileDesc,
+                                        BTileDesc,
+                                        AMmaTileDesc,
+                                        BMmaTileDesc,
+                                        ABlockTransferSrcScalarPerVector,
+                                        BBlockTransferSrcScalarPerVector,
+                                        MPerBlock,
+                                        NPerBlock,
+                                        KPerBlock,
+                                        MPerXDL,
+                                        NPerXDL,
+                                        MRepeat,
+                                        NRepeat,
+                                        KPack>
+
+{
+    using Base = BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                                   ADataType,
+                                                   BDataType,
+                                                   ComputeDataType,
+                                                   AccDataType,
+                                                   ATileDesc,
+                                                   BTileDesc,
+                                                   AMmaTileDesc,
+                                                   BMmaTileDesc,
+                                                   ABlockTransferSrcScalarPerVector,
+                                                   BBlockTransferSrcScalarPerVector,
+                                                   MPerBlock,
+                                                   NPerBlock,
+                                                   KPerBlock,
+                                                   MPerXDL,
+                                                   NPerXDL,
+                                                   MRepeat,
+                                                   NRepeat,
+                                                   KPack>;
+    using Base::A_K1;
+    using Base::B_K1;
+    using Base::I0;
+    using Base::I1;
+    using Base::KRepeat;
+    using Base::xdlops_gemm;
+    using typename Base::HotLoopInstList;
+
+    using Base::a_block_desc_m0_m1_m2_k;
+    using Base::CalculateCThreadOriginDataIndex;
+    using Base::CalculateCThreadOriginDataIndex8D;
+    using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::GetCThreadBuffer;
+    using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+
+    using Base::AMmaKStride;
+    using Base::BMmaKStride;
+
+    static constexpr index_t PrefetchStages  = 3;
+    static constexpr index_t PrefillStages   = 2;
+    static constexpr index_t GlobalBufferNum = 2;
+
+    template <typename TileDesc_M0_M1_M2_K>
+    __host__ __device__ static constexpr auto MakeAGemmMmaTileDescriptor(const TileDesc_M0_M1_M2_K&)
+    {
+        constexpr index_t M0 = TileDesc_M0_M1_M2_K{}.GetLength(Number<0>{});
+        constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
+        constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
+        constexpr index_t K2 = KPack;
+        constexpr index_t K1 = 64 / NPerXDL;
+        constexpr index_t K0 = KRepeat;
+
+        return transform_tensor_descriptor(
+            TileDesc_M0_M1_M2_K{},
+            make_tuple(
+                make_pass_through_transform(Number<M0>{}),
+                make_pass_through_transform(Number<M1>{}),
+                make_pass_through_transform(Number<M2>{}),
+                make_unmerge_transform(make_tuple(Number<K0>{}, Number<K1>{}, Number<K2>{}))),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3, 4, 5>{}));
+    }
+
+    static constexpr auto a_block_desc_m0_m1_m2_k0_k1_k2 =
+        MakeAGemmMmaTileDescriptor(a_block_desc_m0_m1_m2_k);
+
+    __host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop)
+    {
+        return num_loop > PrefetchStages;
+    }
+
+    __host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
+    {
+
+        return num_loop % 2 == 0 ? TailNumber::Even : TailNumber::Odd;
+    }
+
+    __device__ static constexpr auto HotLoopScheduler()
+    {
+        // constexpr auto num_ds_read_inst_a     = HotLoopInstList::A_LDS_Read_Inst_Num;
+        constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num;
+        constexpr auto num_buffer_load_inst_b = HotLoopInstList::B_Buffer_Load_Inst_Num;
+
+        // B global + A local
+        static_for<0, num_buffer_load_inst_b / 2, 1>{}([&](auto i) {
+            ignore = i;
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read B
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read   A
+        });
+
+        static_for<0, num_buffer_load_inst_b / 2, 1>{}([&](auto i) {
+            ignore = i;
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read B
+            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read   A
+        });
+
+        // A global
+        static_for<0, num_buffer_load_inst_a, 1>{}([&](auto i) {
+            ignore = i;
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+        });
+
+        // A local
+        // static_for<0, num_ds_read_inst_a / 2, 1>{}([&](auto i) {
+        //     ignore = i;
+        //     __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+        //     __builtin_amdgcn_sched_group_barrier(0x100, 2, 0); // DS read
+        // });
+    }
+
+    template <bool HasMainLoop,
+              TailNumber TailNum,
+              typename AGridDesc,
+              typename ABlockDesc,
+              typename ABlockTransfer,
+              typename AGridBuffer,
+              typename ABlockBuffer,
+              typename ABlockTransferStep,
+              typename BGridDesc,
+              typename BBlockTransfer,
+              typename BGridBuffer,
+              typename BBlockBuffer,
+              typename BBlockTransferStep,
+              typename CThreadBuffer>
+    __device__ void Run(const AGridDesc& a_grid_desc,
+                        const ABlockDesc& a_block_desc,
+                        ABlockTransfer& a_blockwise_copy,
+                        const AGridBuffer& a_grid_buf,
+                        ABlockBuffer& a_block_buf,
+                        const ABlockTransferStep& a_block_copy_step,
+                        const BGridDesc& b_grid_desc,
+                        BBlockTransfer& b_blockwise_copy,
+                        const BGridBuffer& b_grid_buf,
+                        BBlockBuffer& b_block_buf,
+                        const BBlockTransferStep& b_block_copy_step,
+                        CThreadBuffer& c_thread_buf,
+                        index_t num_loop) const
+    {
+        ignore = b_block_buf;
+        __builtin_amdgcn_sched_barrier(0);
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            a_thread_desc_.GetElementSpaceSize());
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            b_thread_desc_.GetElementSpaceSize());
+
+        StaticallyIndexedArray<decltype(a_thread_buf), Number<2>{}> a_thread_bufs;
+        StaticallyIndexedArray<decltype(b_thread_buf), Number<2>{}> b_thread_bufs;
+        constexpr auto b_block_origin_idx = make_tuple(I0, I0, I0, I0);
+
+        // Global prefetch A1, B1
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0);
+        b_blockwise_copy.Run(b_grid_desc,
+                             b_grid_buf,
+                             b_block_desc_n0_n1_k0_k1,
+                             b_block_origin_idx,
+                             b_thread_bufs(I0));
+
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+        // Local prefill A1
+        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I0), I0);
+
+        // Global prefetch A2
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I1);
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+
+        // Local prefetch A1
+        block_sync_lds();
+        static_for<0, MRepeat, 1>{}([&](auto m0) {
+            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                   make_tuple(m0, I0, I0, k0, I0, I0),
+                                   a_block_buf.At(I0),
+                                   a_thread_desc_,
+                                   make_tuple(m0, I0, I0, k0, I0, I0),
+                                   a_thread_bufs(I0));
+            });
+        });
+
+        // Local prefill A2
+        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I1), I1);
+
+        // // Global prefetch A3
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0);
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+
+        // Initialize C
+        c_thread_buf.Clear();
+
+        __builtin_amdgcn_sched_barrier(0);
+
+        // main body
+        if constexpr(HasMainLoop)
+        {
+            index_t i = 0;
+            do
+            {
+                auto LoopFunc = [&](auto mfma_reg_buf, auto local_read_buf) {
+                    block_sync_lds();
+
+                    b_blockwise_copy.Run(b_grid_desc,
+                                         b_grid_buf,
+                                         b_block_desc_n0_n1_k0_k1,
+                                         b_block_origin_idx,
+                                         b_thread_bufs(local_read_buf));
+                    b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        static_for<0, KRepeat, 1>{}([&](auto k0) {
+                            a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                               make_tuple(m0, I0, I0, k0, I0, I0),
+                                               a_block_buf.At(local_read_buf),
+                                               a_thread_desc_,
+                                               make_tuple(m0, I0, I0, k0, I0, I0),
+                                               a_thread_bufs(local_read_buf));
+                        });
+                    });
+
+                    a_blockwise_copy.RunWrite(
+                        a_block_desc, a_block_buf.At(mfma_reg_buf), mfma_reg_buf);
+
+                    a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, local_read_buf);
+                    a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        static_for<0, NRepeat, 1>{}([&](auto n0) {
+                            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                                vector_type<ComputeDataType, KPack> a_thread_vec;
+                                vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                                static_for<0, KPack, 1>{}([&](auto ik) {
+                                    a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        a_thread_bufs[mfma_reg_buf]
+                                                     [Number<a_thread_desc_.CalculateOffset(
+                                                         make_tuple(m0, I0, I0, k0, I0, ik))>{}];
+                                    b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        b_thread_bufs[mfma_reg_buf]
+                                                     [Number<b_thread_desc_.CalculateOffset(
+                                                         make_tuple(n0, I0, k0, ik))>{}];
+                                });
+
+                                using mfma_input_type =
+                                    typename vector_type<ComputeDataType,
+                                                         xdlops_gemm.K1PerXdlops>::type;
+
+                                constexpr index_t c_offset =
+                                    c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                                xdlops_gemm.Run(
+                                    a_thread_vec.template AsType<mfma_input_type>(),
+                                    b_thread_vec.template AsType<mfma_input_type>(),
+                                    c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                            });
+                        });
+                    });
+
+                    HotLoopScheduler();
+                    __builtin_amdgcn_sched_barrier(0);
+                };
+
+                LoopFunc(I0, I1);
+                LoopFunc(I1, I0);
+
+                i += 2;
+            } while(i < (num_loop - 3));
+        }
+        // tail
+
+        auto ReadWriteCompFunc = [&](auto mfma_reg, auto local_read_reg) {
+            block_sync_lds();
+
+            b_blockwise_copy.Run(b_grid_desc,
+                                 b_grid_buf,
+                                 b_block_desc_n0_n1_k0_k1,
+                                 b_block_origin_idx,
+                                 b_thread_bufs(local_read_reg));
+            b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, KRepeat, 1>{}([&](auto k0) {
+                    a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                       make_tuple(m0, I0, I0, k0, I0, I0),
+                                       a_block_buf.At(local_read_reg),
+                                       a_thread_desc_,
+                                       make_tuple(m0, I0, I0, k0, I0, I0),
+                                       a_thread_bufs(local_read_reg));
+                });
+            });
+
+            a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(mfma_reg), mfma_reg);
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_bufs[mfma_reg][Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0, I0, I0, k0, I0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs[mfma_reg][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+            });
+
+            HotLoopScheduler();
+            __builtin_amdgcn_sched_barrier(0);
+        };
+
+        auto ReadCompFunc = [&](auto mfma_reg, auto local_read_reg) {
+            block_sync_lds();
+
+            b_blockwise_copy.Run(b_grid_desc,
+                                 b_grid_buf,
+                                 b_block_desc_n0_n1_k0_k1,
+                                 b_block_origin_idx,
+                                 b_thread_bufs(local_read_reg));
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, KRepeat, 1>{}([&](auto k0) {
+                    a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                       make_tuple(m0, I0, I0, k0, I0, I0),
+                                       a_block_buf.At(local_read_reg),
+                                       a_thread_desc_,
+                                       make_tuple(m0, I0, I0, k0, I0, I0),
+                                       a_thread_bufs(local_read_reg));
+                });
+            });
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_bufs[mfma_reg][Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0, I0, I0, k0, I0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs[mfma_reg][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+            });
+
+            HotLoopScheduler();
+            __builtin_amdgcn_sched_barrier(0);
+        };
+
+        auto CompFunc = [&](auto mfma_reg) {
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, NRepeat, 1>{}([&](auto n0) {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_bufs[mfma_reg][Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0, I0, I0, k0, I0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs[mfma_reg][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+            });
+        };
+
+        if constexpr(TailNum == TailNumber::Even)
+        {
+            ReadCompFunc(I0, I1);
+            CompFunc(I1);
+        }
+        else if constexpr(TailNum == TailNumber::Odd)
+        {
+            ReadWriteCompFunc(I0, I1);
+            ReadCompFunc(I1, I0);
+            CompFunc(I0);
+        }
+    }
+
+    protected:
+    // MRepeat MWave MLane KRepeat KLane KPack
+    // KRepeat -> MRepeat-> Mwave->KLane->MLane->KPack
+    static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<MRepeat>{}, I1, I1, Number<KRepeat>{}, I1, Number<KPack>{}));
+
+    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<ADataType,
+                                                         ComputeDataType,
+                                                         decltype(a_block_desc_m0_m1_m2_k0_k1_k2),
+                                                         decltype(a_thread_desc_),
+                                                         Sequence<1, 1, 1, 1, 1, KPack>,
+                                                         Sequence<0, 1, 2, 3, 4, 5>,
+                                                         5,
+                                                         A_K1,
+                                                         A_K1>;
+
+    AThreadCopy a_thread_copy_{Base::CalculateAThreadOriginDataIndex6D()};
+
+    static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<NRepeat>{}, I1, Number<KRepeat>{}, Number<KPack>{}));
+
+    static constexpr BTileDesc b_block_desc_n0_n1_k0_k1;
+
+    using Base::c_thread_desc_;
+};
+
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v3.hpp
@@ -0,0 +1,860 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp"
+
+namespace ck {
+
+// Compute optimized pipeline
+// GlobalPrefetchStages: 2
+// LocalPreFillStages: 1
+// LocalPreFetchStages: 1
+// LocalSharedMemoryBuffer: 1
+
+template <BlockGemmPipelineScheduler BlkGemmPipelineVer,
+          index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPacks>
+struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3
+{
+};
+
+template <index_t BlockSize,
+          typename ADataType,
+          typename BDataType,
+          typename ComputeDataType,
+          typename AccDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat,
+          index_t NRepeat,
+          index_t KPack
+          // ,bool TransposeC //disable transposec right now...
+          >
+struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v3<BlockGemmPipelineScheduler::Intrawave,
+                                                   BlockSize,
+                                                   ADataType,
+                                                   BDataType,
+                                                   ComputeDataType,
+                                                   AccDataType,
+                                                   ATileDesc,
+                                                   BTileDesc,
+                                                   AMmaTileDesc,
+                                                   BMmaTileDesc,
+                                                   ABlockTransferSrcScalarPerVector,
+                                                   BBlockTransferSrcScalarPerVector,
+                                                   MPerBlock,
+                                                   NPerBlock,
+                                                   KPerBlock,
+                                                   MPerXDL,
+                                                   NPerXDL,
+                                                   MRepeat,
+                                                   NRepeat,
+                                                   KPack>
+    : BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                        ADataType,
+                                        BDataType,
+                                        ComputeDataType,
+                                        AccDataType,
+                                        ATileDesc,
+                                        BTileDesc,
+                                        AMmaTileDesc,
+                                        BMmaTileDesc,
+                                        ABlockTransferSrcScalarPerVector,
+                                        BBlockTransferSrcScalarPerVector,
+                                        MPerBlock,
+                                        NPerBlock,
+                                        KPerBlock,
+                                        MPerXDL,
+                                        NPerXDL,
+                                        MRepeat,
+                                        NRepeat,
+                                        KPack>
+
+{
+    using Base = BlockwiseGemmXdlops_pipeline_base<BlockSize,
+                                                   ADataType,
+                                                   BDataType,
+                                                   ComputeDataType,
+                                                   AccDataType,
+                                                   ATileDesc,
+                                                   BTileDesc,
+                                                   AMmaTileDesc,
+                                                   BMmaTileDesc,
+                                                   ABlockTransferSrcScalarPerVector,
+                                                   BBlockTransferSrcScalarPerVector,
+                                                   MPerBlock,
+                                                   NPerBlock,
+                                                   KPerBlock,
+                                                   MPerXDL,
+                                                   NPerXDL,
+                                                   MRepeat,
+                                                   NRepeat,
+                                                   KPack>;
+    using Base::A_K1;
+    using Base::B_K1;
+    using Base::I0;
+    using Base::I1;
+    using Base::I2;
+    using Base::KRepeat;
+    using Base::xdlops_gemm;
+    using typename Base::HotLoopInstList;
+
+    using Base::a_block_desc_m0_m1_m2_k;
+    using Base::CalculateCThreadOriginDataIndex;
+    using Base::CalculateCThreadOriginDataIndex8D;
+    using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::GetCThreadBuffer;
+    using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+
+    using Base::AMmaKStride;
+    using Base::BMmaKStride;
+
+    using Base::MWaves;
+
+    static constexpr index_t PrefetchStages        = 2;
+    static constexpr index_t PrefillStages         = 1;
+    static constexpr index_t GlobalBufferNum       = 1;
+    static constexpr index_t HotloopLocalBufSwitch = MRepeat % 2 == 0 ? 0 : 1;
+
+    template <typename TileDesc_M0_M1_M2_K>
+    __host__ __device__ static constexpr auto MakeAGemmMmaTileDescriptor(const TileDesc_M0_M1_M2_K&)
+    {
+        constexpr index_t M0 = TileDesc_M0_M1_M2_K{}.GetLength(Number<0>{});
+        constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
+        constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
+        constexpr index_t K2 = KPack;
+        constexpr index_t K1 = 64 / NPerXDL;
+        constexpr index_t K0 = KRepeat;
+
+        return transform_tensor_descriptor(
+            TileDesc_M0_M1_M2_K{},
+            make_tuple(
+                make_pass_through_transform(Number<M0>{}),
+                make_pass_through_transform(Number<M1>{}),
+                make_pass_through_transform(Number<M2>{}),
+                make_unmerge_transform(make_tuple(Number<K0>{}, Number<K1>{}, Number<K2>{}))),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3, 4, 5>{}));
+    }
+
+    static constexpr auto a_block_desc_m0_m1_m2_k0_k1_k2 =
+        MakeAGemmMmaTileDescriptor(a_block_desc_m0_m1_m2_k);
+
+    __host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop)
+    {
+        return num_loop > PrefetchStages;
+    }
+
+    __host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
+    {
+        return num_loop % 2 == 0 ? TailNumber::Even : TailNumber::Odd;
+    }
+
+    template <typename Stage>
+    __device__ static constexpr auto HotLoopScheduler(Stage stage)
+    {
+        constexpr auto num_ds_read_inst_a     = HotLoopInstList::A_LDS_Read_Inst_Num;
+        constexpr auto num_ds_write_inst_a    = HotLoopInstList::A_LDS_Write_Inst_Num;
+        constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num;
+        constexpr auto num_buffer_load_inst_b = MWaves * HotLoopInstList::B_Buffer_Load_Inst_Num;
+
+        constexpr auto num_mfma = HotLoopInstList::C_MFMA_Inst_Num;
+
+        constexpr auto staged_num_ds_read_inst_a = num_ds_read_inst_a / MRepeat;
+        constexpr auto staged_num_mfma           = num_mfma / MRepeat;
+
+        constexpr auto staged_num_mfma_per_ds_read_a = staged_num_mfma / staged_num_ds_read_inst_a;
+
+        if constexpr(stage.value == 0)
+        {
+            constexpr auto staged_num_buffer_load_b_per_ds_read_a =
+                num_buffer_load_inst_b / staged_num_ds_read_inst_a;
+            constexpr auto staged_num_mfma_per_buffer_load_b =
+                staged_num_mfma / num_buffer_load_inst_b;
+            // B global
+            static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) {
+                ignore = i_inst;
+
+                static_for<0, staged_num_buffer_load_b_per_ds_read_a - 1, 1>{}([&](auto ibuf_inst) {
+                    ignore = ibuf_inst;
+                    __builtin_amdgcn_sched_group_barrier(
+                        0x008, staged_num_mfma_per_buffer_load_b, 0);  // MFMA
+                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                });
+
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                __builtin_amdgcn_sched_group_barrier(
+                    0x008, staged_num_mfma_per_buffer_load_b - 1, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);    // VMEM read
+            });
+
+            __builtin_amdgcn_sched_barrier(0);
+        }
+        else if constexpr(stage.value == 1)
+        {
+            constexpr auto staged_num_mfma_per_ds_write_a =
+                math::integer_divide_ceil(staged_num_mfma, num_ds_write_inst_a);
+
+            constexpr auto stage_more_mfma =
+                staged_num_mfma - (staged_num_mfma_per_ds_write_a - 1) * num_ds_write_inst_a;
+
+            // A local write
+            static_for<0, num_ds_write_inst_a, 1>{}([&](auto i_inst) {
+                if constexpr(i_inst.value < stage_more_mfma)
+                {
+                    if(i_inst.value < staged_num_ds_read_inst_a)
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_ds_write_a - 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                    }
+                    else
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_ds_write_a, 0);     // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                    }
+                }
+                else
+                {
+                    if(i_inst.value < staged_num_ds_read_inst_a)
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_ds_write_a - 2, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                    }
+                    else
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_ds_write_a - 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                    }
+                }
+            });
+
+            __builtin_amdgcn_sched_barrier(0);
+        }
+        else if constexpr(stage.value == 2)
+        {
+            constexpr auto staged_num_mfma_per_buffer_load_a =
+                math::integer_divide_ceil(staged_num_mfma, num_buffer_load_inst_a);
+
+            constexpr auto stage_more_mfma =
+                staged_num_mfma - (staged_num_mfma_per_buffer_load_a - 1) * num_buffer_load_inst_a;
+
+            // A global
+            static_for<0, num_buffer_load_inst_a, 1>{}([&](auto i_inst) {
+                if constexpr(i_inst.value < stage_more_mfma)
+                {
+                    if(i_inst.value < staged_num_ds_read_inst_a)
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_buffer_load_a - 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);    // VMEM read
+                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);    // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0);    // DS read
+                    }
+                    else
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_buffer_load_a, 0);  // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                    }
+                }
+                else
+                {
+                    if(i_inst.value < staged_num_ds_read_inst_a)
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_buffer_load_a - 2, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);    // VMEM read
+                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);    // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0);    // DS read
+                    }
+                    else
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_buffer_load_a - 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);    // VMEM read
+                    }
+                }
+            });
+
+            __builtin_amdgcn_sched_barrier(0);
+        }
+        else
+        {
+            // A local Read
+            static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) {
+                ignore = i_inst;
+                __builtin_amdgcn_sched_group_barrier(
+                    0x008, staged_num_mfma_per_ds_read_a, 0);      // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            });
+
+            __builtin_amdgcn_sched_barrier(0);
+        }
+    }
+
+    template <typename Stage>
+    __device__ static constexpr auto EpilogueScheduler_1(Stage stage)
+    {
+        constexpr auto num_ds_read_inst_a     = HotLoopInstList::A_LDS_Read_Inst_Num;
+        constexpr auto num_ds_write_inst_a    = HotLoopInstList::A_LDS_Write_Inst_Num;
+        constexpr auto num_buffer_load_inst_b = MWaves * HotLoopInstList::B_Buffer_Load_Inst_Num;
+
+        constexpr auto num_mfma = HotLoopInstList::C_MFMA_Inst_Num;
+
+        constexpr auto staged_num_ds_read_inst_a = num_ds_read_inst_a / MRepeat;
+        constexpr auto staged_num_mfma           = num_mfma / MRepeat;
+
+        constexpr auto staged_num_mfma_per_ds_read_a = staged_num_mfma / staged_num_ds_read_inst_a;
+
+        if constexpr(stage.value == 0)
+        {
+            constexpr auto staged_num_buffer_load_b_per_ds_read_a =
+                num_buffer_load_inst_b / staged_num_ds_read_inst_a;
+            constexpr auto staged_num_mfma_per_buffer_load_b =
+                staged_num_mfma / num_buffer_load_inst_b;
+            // B global
+            static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) {
+                ignore = i_inst;
+
+                static_for<0, staged_num_buffer_load_b_per_ds_read_a, 1>{}([&](auto ibuf_inst) {
+                    ignore = ibuf_inst;
+                    __builtin_amdgcn_sched_group_barrier(
+                        0x008, staged_num_mfma_per_buffer_load_b, 0);  // MFMA
+                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                });
+
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                __builtin_amdgcn_sched_group_barrier(
+                    0x008, staged_num_mfma_per_buffer_load_b - 1, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x020, 1, 0);    // VMEM read
+            });
+
+            __builtin_amdgcn_sched_barrier(0);
+        }
+        else if constexpr(stage.value == 1)
+        {
+#if 0
+            constexpr auto staged_num_ds_write_a_per_ds_read_a =
+                num_ds_write_inst_a / staged_num_ds_read_inst_a;
+            constexpr auto staged_num_mfma_per_ds_write_a = staged_num_mfma / num_ds_write_inst_a;
+            // A local write
+            static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) {
+                ignore = i_inst;
+
+                static_for<0, staged_num_ds_write_a_per_ds_read_a, 1>{}([&](auto idswrite_inst) {
+                    ignore = idswrite_inst;
+                    __builtin_amdgcn_sched_group_barrier(
+                        0x008, staged_num_mfma_per_ds_write_a - 1, 0); // MFMA
+                    __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                });
+
+                __builtin_amdgcn_sched_group_barrier(
+                    0x008, staged_num_ds_write_a_per_ds_read_a, 0); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0);  // DS read
+            });
+#elif 1
+            constexpr auto staged_num_mfma_per_ds_write_a =
+                math::integer_divide_ceil(staged_num_mfma, num_ds_write_inst_a);
+
+            constexpr auto stage_more_mfma =
+                staged_num_mfma - (staged_num_mfma_per_ds_write_a - 1) * num_ds_write_inst_a;
+
+            // A local write
+            static_for<0, num_ds_write_inst_a, 1>{}([&](auto i_inst) {
+                if constexpr(i_inst.value < stage_more_mfma)
+                {
+                    if(i_inst.value < staged_num_ds_read_inst_a)
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_ds_write_a - 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                    }
+                    else
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_ds_write_a, 0);     // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                    }
+                }
+                else
+                {
+                    if(i_inst.value < staged_num_ds_read_inst_a)
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_ds_write_a - 2, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                    }
+                    else
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x008, staged_num_mfma_per_ds_write_a - 1, 0); // MFMA
+                        __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS Write
+                    }
+                }
+            });
+#endif
+            __builtin_amdgcn_sched_barrier(0);
+        }
+        else
+        {
+            // A local Read
+            static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) {
+                ignore = i_inst;
+                __builtin_amdgcn_sched_group_barrier(
+                    0x008, staged_num_mfma_per_ds_read_a, 0);      // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+            });
+
+            __builtin_amdgcn_sched_barrier(0);
+        }
+    }
+
+    __device__ static constexpr auto EpilogueScheduler_2()
+    {
+        constexpr auto num_ds_read_inst_a = HotLoopInstList::A_LDS_Read_Inst_Num;
+
+        constexpr auto num_mfma = HotLoopInstList::C_MFMA_Inst_Num;
+
+        constexpr auto staged_num_ds_read_inst_a = num_ds_read_inst_a / MRepeat;
+        constexpr auto staged_num_mfma           = num_mfma / MRepeat;
+
+        constexpr auto staged_num_mfma_per_ds_read_a = staged_num_mfma / staged_num_ds_read_inst_a;
+
+        // A local Read
+        static_for<0, staged_num_ds_read_inst_a, 1>{}([&](auto i_inst) {
+            ignore = i_inst;
+            __builtin_amdgcn_sched_group_barrier(0x008, staged_num_mfma_per_ds_read_a, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+        });
+
+        __builtin_amdgcn_sched_barrier(0);
+    }
+
+    template <bool HasMainLoop,
+              TailNumber TailNum,
+              typename AGridDesc,
+              typename ABlockDesc,
+              typename ABlockTransfer,
+              typename AGridBuffer,
+              typename ABlockBuffer,
+              typename ABlockTransferStep,
+              typename BGridDesc,
+              typename BBlockTransfer,
+              typename BGridBuffer,
+              typename BBlockBuffer,
+              typename BBlockTransferStep,
+              typename CThreadBuffer>
+    __device__ void Run(const AGridDesc& a_grid_desc,
+                        const ABlockDesc& a_block_desc,
+                        ABlockTransfer& a_blockwise_copy,
+                        const AGridBuffer& a_grid_buf,
+                        ABlockBuffer& a_block_buf,
+                        const ABlockTransferStep& a_block_copy_step,
+                        const BGridDesc& b_grid_desc,
+                        BBlockTransfer& b_blockwise_copy,
+                        const BGridBuffer& b_grid_buf,
+                        BBlockBuffer& b_block_buf,
+                        const BBlockTransferStep& b_block_copy_step,
+                        CThreadBuffer& c_thread_buf,
+                        index_t num_loop) const
+    {
+        ignore = b_block_buf;
+        __builtin_amdgcn_sched_barrier(0);
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            a_thread_desc_.GetElementSpaceSize());
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeDataType>(
+            b_thread_desc_.GetElementSpaceSize());
+
+        StaticallyIndexedArray<decltype(b_thread_buf), Number<2>{}> b_thread_bufs;
+        constexpr auto b_block_origin_idx = make_tuple(I0, I0, I0, I0);
+
+        // Global prefetch A1 B1
+        b_blockwise_copy.Run(b_grid_desc,
+                             b_grid_buf,
+                             b_block_desc_n0_n1_k0_k1,
+                             b_block_origin_idx,
+                             b_thread_bufs(I0));
+        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+        __builtin_amdgcn_sched_barrier(0);
+
+        // // Local prefill A1
+        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I0));
+
+        // // Global prefetch A2
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+
+        // Local prefetch A1
+        block_sync_lds();
+        static_for<0, KRepeat, 1>{}([&](auto k0) {
+            a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                               make_tuple(I0, I0, I0, k0, I0, I0),
+                               a_block_buf.At(I0),
+                               a_thread_desc_,
+                               make_tuple(I0, I0, I0, k0, I0, I0),
+                               a_thread_buf);
+        });
+
+        // Initialize C
+        c_thread_buf.Clear();
+
+        __builtin_amdgcn_sched_barrier(0);
+
+        // main body
+        if constexpr(HasMainLoop)
+        {
+            index_t i = 0;
+            do
+            {
+                auto LoopFunc = [&](auto mfma_reg_buf, auto local_read_buf) {
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        if constexpr(m0.value == 0)
+                        {
+                            b_blockwise_copy.Run(b_grid_desc,
+                                                 b_grid_buf,
+                                                 b_block_desc_n0_n1_k0_k1,
+                                                 b_block_origin_idx,
+                                                 b_thread_bufs(local_read_buf));
+                            b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+                        }
+                        else if constexpr(m0.value == 1)
+                        {
+                            a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(local_read_buf));
+                        }
+                        else if constexpr(m0.value == 2)
+                        {
+                            a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+                            a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+                        }
+
+                        static_for<0, KRepeat, 1>{}([&](auto k0) {
+                            static_for<0, NRepeat, 1>{}([&](auto n0) {
+                                vector_type<ComputeDataType, KPack> a_thread_vec;
+                                vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                                static_for<0, KPack, 1>{}([&](auto ik) {
+                                    a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                            make_tuple((m0 + HotloopLocalBufSwitch * mfma_reg_buf) %
+                                                           2,
+                                                       I0,
+                                                       I0,
+                                                       k0,
+                                                       I0,
+                                                       ik))>{}];
+                                    b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                        b_thread_bufs[mfma_reg_buf]
+                                                     [Number<b_thread_desc_.CalculateOffset(
+                                                         make_tuple(n0, I0, k0, ik))>{}];
+                                });
+
+                                using mfma_input_type =
+                                    typename vector_type<ComputeDataType,
+                                                         xdlops_gemm.K1PerXdlops>::type;
+
+                                constexpr index_t c_offset =
+                                    c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                                xdlops_gemm.Run(
+                                    a_thread_vec.template AsType<mfma_input_type>(),
+                                    b_thread_vec.template AsType<mfma_input_type>(),
+                                    c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                            });
+                        });
+
+                        if constexpr(m0.value == MRepeat - 1)
+                        {
+                            block_sync_lds();
+
+                            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                                a_thread_copy_.Run(
+                                    a_block_desc_m0_m1_m2_k0_k1_k2,
+                                    make_tuple(Number<(m0 + 1) % MRepeat>{}, I0, I0, k0, I0, I0),
+                                    a_block_buf.At(local_read_buf),
+                                    a_thread_desc_,
+                                    make_tuple(
+                                        Number<(m0 + 1 + HotloopLocalBufSwitch * mfma_reg_buf) %
+                                               2>{},
+                                        I0,
+                                        I0,
+                                        k0,
+                                        I0,
+                                        I0),
+                                    a_thread_buf);
+                            });
+                        }
+                        else
+                        {
+                            static_for<0, KRepeat, 1>{}([&](auto k0) {
+                                a_thread_copy_.Run(
+                                    a_block_desc_m0_m1_m2_k0_k1_k2,
+                                    make_tuple(Number<(m0 + 1) % MRepeat>{}, I0, I0, k0, I0, I0),
+                                    a_block_buf.At(mfma_reg_buf),
+                                    a_thread_desc_,
+                                    make_tuple(
+                                        Number<(m0 + 1 + HotloopLocalBufSwitch * mfma_reg_buf) %
+                                               2>{},
+                                        I0,
+                                        I0,
+                                        k0,
+                                        I0,
+                                        I0),
+                                    a_thread_buf);
+                            });
+                        }
+
+                        HotLoopScheduler(m0);
+                    });
+                };
+
+                LoopFunc(I0, I1);
+                LoopFunc(I1, I0);
+
+                i += 2;
+            } while(i < (num_loop - 2));
+        }
+        // tail
+        if constexpr(TailNum == TailNumber::Even)
+        {
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                if constexpr(m0.value == 0)
+                {
+                    b_blockwise_copy.Run(b_grid_desc,
+                                         b_grid_buf,
+                                         b_block_desc_n0_n1_k0_k1,
+                                         b_block_origin_idx,
+                                         b_thread_bufs(I1));
+                }
+                else if constexpr(m0.value == MRepeat - 1)
+                {
+                    a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I1));
+                }
+
+                static_for<0, KRepeat, 1>{}([&](auto k0) {
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0 % 2, I0, I0, k0, I0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs[I0][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+
+                if constexpr(m0.value == MRepeat - 1)
+                {
+                    block_sync_lds();
+
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        a_thread_copy_.Run(
+                            a_block_desc_m0_m1_m2_k0_k1_k2,
+                            make_tuple(Number<(m0 + 1) % MRepeat>{}, I0, I0, k0, I0, I0),
+                            a_block_buf.At(I1),
+                            a_thread_desc_,
+                            make_tuple(Number<(m0 + 1) % 2>{}, I0, I0, k0, I0, I0),
+                            a_thread_buf);
+                    });
+                }
+                else
+                {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        a_thread_copy_.Run(
+                            a_block_desc_m0_m1_m2_k0_k1_k2,
+                            make_tuple(Number<(m0 + 1) % MRepeat>{}, I0, I0, k0, I0, I0),
+                            a_block_buf.At(I0),
+                            a_thread_desc_,
+                            make_tuple(Number<(m0 + 1) % 2>{}, I0, I0, k0, I0, I0),
+                            a_thread_buf);
+                    });
+                }
+
+                EpilogueScheduler_1(m0);
+            });
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, KRepeat, 1>{}([&](auto k0) {
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(make_tuple(
+                                    (m0 + HotloopLocalBufSwitch) % 2, I0, I0, k0, I0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs[I1][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+
+                if constexpr(m0.value != (MRepeat - 1))
+                {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        a_thread_copy_.Run(
+                            a_block_desc_m0_m1_m2_k0_k1_k2,
+                            make_tuple(Number<m0 + 1>{}, I0, I0, k0, I0, I0),
+                            a_block_buf.At(I1),
+                            a_thread_desc_,
+                            make_tuple(
+                                Number<(m0 + 1 + HotloopLocalBufSwitch) % 2>{}, I0, I0, k0, I0, I0),
+                            a_thread_buf);
+                    });
+
+                    EpilogueScheduler_2();
+                }
+            });
+            // Let's leak last MFMA block to epilogue region, cover the potential lds-shuffle
+            // latency
+            // __builtin_amdgcn_sched_barrier(0);
+        }
+        else
+        {
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, KRepeat, 1>{}([&](auto k0) {
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        vector_type<ComputeDataType, KPack> a_thread_vec;
+                        vector_type<ComputeDataType, KPack> b_thread_vec;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(m0 % 2, I0, I0, k0, I0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeDataType>()(ik) =
+                                b_thread_bufs[I0][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(n0, I0, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type =
+                            typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
+
+                        constexpr index_t c_offset =
+                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
+
+                        xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
+                                        b_thread_vec.template AsType<mfma_input_type>(),
+                                        c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+
+                if constexpr(m0.value != (MRepeat - 1))
+                {
+                    static_for<0, KRepeat, 1>{}([&](auto k0) {
+                        a_thread_copy_.Run(a_block_desc_m0_m1_m2_k0_k1_k2,
+                                           make_tuple(Number<m0 + 1>{}, I0, I0, k0, I0, I0),
+                                           a_block_buf.At(I0),
+                                           a_thread_desc_,
+                                           make_tuple(Number<(m0 + 1) % 2>{}, I0, I0, k0, I0, I0),
+                                           a_thread_buf);
+                    });
+
+                    EpilogueScheduler_2();
+                }
+            });
+        }
+    }
+
+    protected:
+    // MRepeat MWave MLane KRepeat KLane KPack
+    // KRepeat -> MRepeat-> Mwave->KLane->MLane->KPack
+    // Reduce the vgpr usage here.
+    static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(I2, I1, I1, Number<KRepeat>{}, I1, Number<KPack>{}));
+
+    using AThreadCopy = ThreadwiseTensorSliceTransfer_v4<ADataType,
+                                                         ComputeDataType,
+                                                         decltype(a_block_desc_m0_m1_m2_k0_k1_k2),
+                                                         decltype(a_thread_desc_),
+                                                         Sequence<1, 1, 1, 1, 1, KPack>,
+                                                         Sequence<0, 1, 2, 3, 4, 5>,
+                                                         5,
+                                                         A_K1,
+                                                         A_K1>;
+
+    AThreadCopy a_thread_copy_{Base::CalculateAThreadOriginDataIndex6D()};
+
+    static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<NRepeat>{}, I1, Number<KRepeat>{}, Number<KPack>{}));
+
+    static constexpr BTileDesc b_block_desc_n0_n1_k0_k1;
+
+    using Base::c_thread_desc_;
+};
+
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_scale_selector.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_scale_selector.hpp
@@ -11,15 +11,6 @@

 namespace ck {

-enum struct BlockGemmPipelineVersion
-{
-    v1, // Naive
-    v2, // Mem
-    v3, // Comp
-    v4, // Comp, double lds buffer
-    v5, // Comp, double global prefetch register buffer
-};
-
 template <BlockGemmPipelineVersion BlkGemmPipelineVer,
          BlockGemmPipelineScheduler BlkGemmPipeSche,
          index_t BlockSize,
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp
@@ -54,8 +54,9 @@ struct BlockwiseGemmXdlops_pipeline_base
    static constexpr index_t AMmaKStride = KPack;
    static constexpr index_t BMmaKStride = KPack;

-    static constexpr index_t KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops;
-    static constexpr index_t KRepeat    = KPerThread / KPack;
+    static constexpr index_t KPerThread    = KPerBlock / xdlops_gemm.K0PerXdlops;
+    static constexpr index_t KRepeat       = KPerThread / KPack;
+    static constexpr index_t KPerInnerLoop = KPack;

    static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL);
    static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL);
@@ -112,6 +113,17 @@ struct BlockwiseGemmXdlops_pipeline_base
        return make_tuple(0, waveId_m, xdlops_a_idx[I1], KPerThread * xdlops_a_idx[I0]);
    }

+    __device__ static auto CalculateAThreadOriginDataIndex6D()
+    {
+        const auto wave_idx = GetWaveIdx();
+
+        const auto waveId_m = wave_idx[I0];
+
+        const auto xdlops_a_idx = xdlops_gemm.CalculateAThreadOriginDataIndex();
+
+        return make_tuple(0, waveId_m, xdlops_a_idx[I1], 0, xdlops_a_idx[I0], 0);
+    }
+
    __device__ static auto CalculateBThreadOriginDataIndex()
    {
        const auto wave_idx = GetWaveIdx();
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_selector.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_selector.hpp
@@ -11,15 +11,6 @@

 namespace ck {

-enum struct BlockGemmPipelineVersion
-{
-    v1, // Naive
-    v2, // Mem
-    v3, // Comp
-    v4, // Comp, double lds buffer
-    v5, // Comp, double global prefetch register buffer
-};
-
 template <BlockGemmPipelineVersion BlkGemmPipelineVer,
          BlockGemmPipelineScheduler BlkGemmPipeSche,
          index_t BlockSize,
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_v4.hpp
@@ -8,7 +8,7 @@
 namespace ck {

 // Compute optimimal pipeline with highest resource request
-// GlobalPrefetchStages: 4
+// GlobalPrefetchStages: 3
 // LocalPreFillStages: 2
 // LocalPreFetchStages: 1
 // LocalSharedMemoryBuffer: 2
@@ -142,9 +142,9 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
    using Base::AMmaKStride;
    using Base::BMmaKStride;

-    static constexpr index_t PrefetchStages  = 4;
+    static constexpr index_t PrefetchStages  = 3;
    static constexpr index_t PrefillStages   = 2;
-    static constexpr index_t GlobalBufferNum = 2;
+    static constexpr index_t GlobalBufferNum = 1;
    static constexpr index_t HotloopUnroll   = 2;

    __host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop)
@@ -164,8 +164,7 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
        }
    }

-    template <typename ScheduleGroup>
-    __device__ static constexpr void HotLoopScheduler(ScheduleGroup schedule_group)
+    __device__ static constexpr void HotLoopScheduler()
    {
        // TODO: Take data type into consideration as pipe ver 3
        // A-B splited schedule
@@ -195,42 +194,42 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
            ignore = i;
            static_for<0, num_dsread_per_issue_a, 1>{}([&](auto idsread) {
                ignore = idsread;
-                __builtin_amdgcn_sched_group_barrier(0x100, 1, schedule_group); // DS read
-                __builtin_amdgcn_sched_group_barrier(0x008, 1, schedule_group); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
            });

            static_for<0, num_dswrite_per_issue_a, 1>{}([&](auto idswrite) {
                ignore = idswrite;
-                __builtin_amdgcn_sched_group_barrier(0x200, 1, schedule_group); // DS write
-                __builtin_amdgcn_sched_group_barrier(0x008, 1, schedule_group); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
            });

-            __builtin_amdgcn_sched_group_barrier(0x020, 1, schedule_group); // VMEM read
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
            __builtin_amdgcn_sched_group_barrier(0x008,
                                                 num_mfma_per_issue - num_dsread_per_issue_a -
                                                     num_dswrite_per_issue_a,
-                                                 schedule_group); // MFMA
+                                                 0); // MFMA
        });

        static_for<0, num_issue_b, 1>{}([&](auto i) {
            ignore = i;
            static_for<0, num_dsread_per_issue_b, 1>{}([&](auto idsread) {
                ignore = idsread;
-                __builtin_amdgcn_sched_group_barrier(0x100, 1, schedule_group); // DS read
-                __builtin_amdgcn_sched_group_barrier(0x008, 1, schedule_group); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
            });

            static_for<0, num_dswrite_per_issue_b, 1>{}([&](auto idswrite) {
                ignore = idswrite;
-                __builtin_amdgcn_sched_group_barrier(0x200, 1, schedule_group); // DS write
-                __builtin_amdgcn_sched_group_barrier(0x008, 1, schedule_group); // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
            });

-            __builtin_amdgcn_sched_group_barrier(0x020, 1, schedule_group); // VMEM read
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
            __builtin_amdgcn_sched_group_barrier(0x008,
                                                 num_mfma_per_issue - num_dsread_per_issue_a -
                                                     num_dswrite_per_issue_b,
-                                                 schedule_group); // MFMA
+                                                 0); // MFMA
        });
        __builtin_amdgcn_sched_barrier(0);
    }
@@ -274,26 +273,15 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
        StaticallyIndexedArray<decltype(b_thread_buf), Number<2>{}> b_thread_bufs;

        // Global prefetch 1
-        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0);
-        b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I0);
-
-        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
-        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
-
-        // Global prefetch 2
-        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I1);
-        b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I1);
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+        b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);

        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);

        // Local prefill 1
-        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I0), I0);
-        b_blockwise_copy.RunWrite(b_block_desc, b_block_buf.At(I0), I0);
-
-        // Local prefill 2
-        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I1), I1);
-        b_blockwise_copy.RunWrite(b_block_desc, b_block_buf.At(I1), I1);
+        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I0));
+        b_blockwise_copy.RunWrite(b_block_desc, b_block_buf.At(I0));

        // Local prefetch 1
        block_sync_lds();
@@ -316,16 +304,20 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
            });
        });

-        // Global prefetch 3
-        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0);
-        b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I0);
+        // Global prefetch 2
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+        b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);

        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);

-        // Global prefetch 4
-        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I1);
-        b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, I1);
+        // Local prefill 2
+        a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(I1));
+        b_blockwise_copy.RunWrite(b_block_desc, b_block_buf.At(I1));
+
+        // Global prefetch 3
+        a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+        b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);

        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
@@ -343,9 +335,7 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
                auto LoopFunc = [&](auto lds_read_buf,
                                    auto lds_read_reg_buf,
                                    auto lds_write_buf,
-                                    auto vmem_buf,
-                                    auto mfma_reg_buf,
-                                    auto schedule_group) {
+                                    auto mfma_reg_buf) {
                    block_sync_lds();

                    static_for<0, KRepeat, 1>{}([&](auto k) {
@@ -367,13 +357,11 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
                        });
                    });

-                    a_blockwise_copy.RunWrite(
-                        a_block_desc, a_block_buf.At(lds_write_buf), vmem_buf);
-                    b_blockwise_copy.RunWrite(
-                        b_block_desc, b_block_buf.At(lds_write_buf), vmem_buf);
+                    a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(lds_write_buf));
+                    b_blockwise_copy.RunWrite(b_block_desc, b_block_buf.At(lds_write_buf));

-                    a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, vmem_buf);
-                    b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf, vmem_buf);
+                    a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf);
+                    b_blockwise_copy.RunRead(b_grid_desc, b_grid_buf);

                    a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
                    b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
@@ -410,11 +398,11 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
                        });
                    });

-                    HotLoopScheduler(schedule_group);
+                    HotLoopScheduler();
                };

-                LoopFunc(I1, I1, I0, I0, I0, I0);
-                LoopFunc(I0, I0, I1, I1, I1, I0);
+                LoopFunc(I1, I1, I0, I0);
+                LoopFunc(I0, I0, I1, I1);

                i += HotloopUnroll;
            } while(i < (num_loop - PrefetchStages));
@@ -423,9 +411,7 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
        auto ReadWriteCompFunc = [&](auto lds_read_buf,
                                     auto lds_read_reg_buf,
                                     auto lds_write_buf,
-                                     auto vmem_buf,
-                                     auto mfma_reg_buf,
-                                     auto schedule_group) {
+                                     auto mfma_reg_buf) {
            block_sync_lds();

            static_for<0, KRepeat, 1>{}([&](auto k) {
@@ -447,8 +433,8 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
                });
            });

-            a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(lds_write_buf), vmem_buf);
-            b_blockwise_copy.RunWrite(b_block_desc, b_block_buf.At(lds_write_buf), vmem_buf);
+            a_blockwise_copy.RunWrite(a_block_desc, a_block_buf.At(lds_write_buf));
+            b_blockwise_copy.RunWrite(b_block_desc, b_block_buf.At(lds_write_buf));

            static_for<0, KRepeat, 1>{}([&](auto k0) {
                static_for<0, MRepeat, 1>{}([&](auto m0) {
@@ -478,13 +464,10 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
                });
            });

-            HotLoopScheduler(schedule_group);
+            HotLoopScheduler();
        };

-        auto ReadCompFunc = [&](auto lds_read_buf,
-                                auto lds_read_reg_buf,
-                                auto mfma_reg_buf,
-                                auto schedule_group) {
+        auto ReadCompFunc = [&](auto lds_read_buf, auto lds_read_reg_buf, auto mfma_reg_buf) {
            block_sync_lds();

            static_for<0, KRepeat, 1>{}([&](auto k) {
@@ -534,7 +517,7 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
                });
            });

-            HotLoopScheduler(schedule_group);
+            HotLoopScheduler();
        };

        auto CompFunc = [&](auto mfma_reg_buf) {
@@ -569,15 +552,13 @@ struct BlockwiseGemmXdlops_pipeline_v4<BlockGemmPipelineScheduler::Intrawave,
        // tail
        if constexpr(TailNum == TailNumber::Odd)
        {
-            ReadWriteCompFunc(I1, I1, I0, I0, I0, I1);
-            ReadCompFunc(I0, I0, I1, I1);
+            ReadWriteCompFunc(I1, I1, I0, I0);
+            ReadCompFunc(I0, I0, I1);
            CompFunc(I0);
        }
        else if constexpr(TailNum == TailNumber::Even)
        {
-            ReadWriteCompFunc(I1, I1, I0, I0, I0, I1);
-            ReadWriteCompFunc(I0, I0, I1, I1, I1, I1);
-            ReadCompFunc(I1, I1, I0, I1);
+            ReadCompFunc(I1, I1, I0);
            CompFunc(I1);
        }
    }
--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1.hpp
@@ -109,6 +109,12 @@ struct ThreadGroupTensorSliceTransfer_v4r1
        }
    }

+    template <typename SeqIdx, index_t ThreadScratchId = 0>
+    __device__ constexpr auto GetSrcThreadScratchIdx()
+    {
+        return threadwise_transfer_.template GetSrcThreadScratchIdx<SeqIdx, ThreadScratchId>();
+    }
+
    template <typename SrcBuffer, index_t ThreadScratchId = 0>
    __device__ void RunRead(const SrcDesc& src_desc,
                            const SrcBuffer& src_buf,
--- a/include/ck/tensor_operation/gpu/device/device_base.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_base.hpp
@@ -3,11 +3,12 @@

 #pragma once

-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 #include <string>
 #include <sstream>
 #include <regex>
 #include <optional>
+
 #include "ck/stream_config.hpp"
 #endif

@@ -15,7 +16,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {

-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 #define GET_OBJECT_NAME_IMLP                                                  \
    std::optional<std::string> GetObjectName() const override                 \
    {                                                                         \
@@ -77,7 +78,7 @@ struct BaseOperator
    BaseOperator()                    = default;
    BaseOperator(const BaseOperator&) = default;
    BaseOperator& operator=(const BaseOperator&) = default;
-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
    virtual bool IsSupportedArgument(const BaseArgument*) { return false; }
    virtual std::string GetTypeString() const { return ""; }

--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp
@@ -2,9 +2,10 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
-
+#ifndef __HIPCC_RTC__
 #include <iostream>
 #include <vector>
+#endif

 #include "device_base.hpp"

@@ -28,6 +29,7 @@ template <typename ALayout,
          bool MaskOutUpperTriangle> // TODO: enum for mask type
 struct DeviceBatchedGemmSoftmaxGemm : public BaseOperator
 {
+#ifndef __HIPCC_RTC__
    virtual std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const void* p_a,
                        const void* p_b0,
@@ -53,6 +55,7 @@ struct DeviceBatchedGemmSoftmaxGemm : public BaseOperator
                        CElementwiseOperation c_element_op) = 0;

    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+#endif
 };

 } // namespace device
--- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
@@ -2,9 +2,11 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
-
+#ifndef __HIPCC_RTC__
 #include <array>
+#endif

+#include "ck/utility/array.hpp"
 #include "ck/tensor_operation/gpu/device/device_base.hpp"

 namespace ck {
@@ -34,6 +36,7 @@ struct DeviceGemmMultipleD : public BaseOperator
 {
    static constexpr index_t NumDTensor = DsDataType::Size();

+#ifndef __HIPCC_RTC__
    virtual std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const void* p_a,
                        const void* p_b,
@@ -51,6 +54,7 @@ struct DeviceGemmMultipleD : public BaseOperator
                        CDEElementwiseOperation cde_element_op) = 0;

    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+#endif
 };

 // GEMM:
@@ -76,6 +80,7 @@ struct DeviceGemmMultipleDSplitK : public BaseOperator
 {
    static constexpr index_t NumDTensor = DsDataType::Size();

+#ifndef __HIPCC_RTC__
    virtual std::unique_ptr<BaseArgument>
    MakeArgumentPointer(const void* p_a,
                        const void* p_b,
@@ -94,6 +99,52 @@ struct DeviceGemmMultipleDSplitK : public BaseOperator
                        CDEElementwiseOperation cde_element_op) = 0;

    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+#endif
+};
+
+// GEMM:
+//   input : A[M, K], B[K, N],
+//   input : D0[M, N], D1[M, N], ...
+//   output : E[M, N]
+//   C = a_op(A) * b_op(B)
+//   E = cde_op(C, D0, D1, ...)
+// Assume:
+//   D0, D1, ... and E have the same layout
+template <typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename ELayout,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename EDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CDEElementwiseOperation>
+struct DeviceGemmMultipleDSplitKBPreShuffle : public BaseOperator
+{
+    static constexpr index_t NumDTensor = DsDataType::Size();
+
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_b,
+                        std::array<const void*, NumDTensor> p_ds,
+                        void* p_e,
+                        ck::index_t M,
+                        ck::index_t N,
+                        ck::index_t K,
+                        ck::index_t StrideA,
+                        ck::index_t StrideB,
+                        std::array<ck::index_t, NumDTensor> StrideDs,
+                        ck::index_t StrideE,
+                        ck::index_t KBatch,
+                        AElementwiseOperation a_element_op,
+                        BElementwiseOperation b_element_op,
+                        CDEElementwiseOperation cde_element_op) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+
+    virtual int GetPreShuffleParameters() = 0;
 };

 } // namespace device
--- a/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp
@@ -28,8 +28,7 @@ enum struct GemmSpecialization
    NKOPadding,
    MNKOPadding,
 };
-
-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 inline std::string getGemmSpecializationString(const GemmSpecialization& s)
 {
    switch(s)
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
@@ -3,8 +3,12 @@

 #pragma once

+#ifndef __HIPCC_RTC__
 #include <iostream>
 #include <sstream>
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#endif

 #include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
@@ -15,8 +19,6 @@
 #include "ck/tensor_operation/gpu/device/masking_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp"
-#include "ck/host_utility/device_prop.hpp"
-#include "ck/host_utility/kernel_launch.hpp"

 namespace ck {
 namespace tensor_operation {
@@ -429,6 +431,7 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
        matrix_padder.PadN,
        MaskOutUpperTriangle>;

+#ifndef __HIPCC_RTC__
    // Argument
    struct Argument : public BaseArgument
    {
@@ -603,6 +606,7 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
        }
    };
+#endif

    static constexpr bool IsValidCompilationParameter()
    {
@@ -610,6 +614,7 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
        return true;
    }

+#ifndef __HIPCC_RTC__
    static constexpr bool
    IsSupported(index_t MRaw_, index_t NRaw_, index_t KRaw_, index_t Gemm1NRaw_)
    {
@@ -837,6 +842,7 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle

        return str.str();
    }
+#endif

    template <class ADesc, class BDesc, class B1Desc, class CDesc>
    struct Descriptor
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp
@@ -3,8 +3,12 @@

 #pragma once

+#ifndef __HIPCC_RTC__
 #include <iostream>
 #include <sstream>
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#endif

 #include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
@@ -14,8 +18,6 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp"
-#include "ck/host_utility/device_prop.hpp"
-#include "ck/host_utility/kernel_launch.hpp"

 namespace ck {

@@ -224,9 +226,9 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
        return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw);
    }

-    static auto MakeDsGridDescriptor_M_N(const std::array<index_t, NumDTensor>& MRaws,
-                                         const std::array<index_t, NumDTensor>& NRaws,
-                                         const std::array<index_t, NumDTensor>& DsStride)
+    static auto MakeDsGridDescriptor_M_N(const Array<index_t, NumDTensor>& MRaws,
+                                         const Array<index_t, NumDTensor>& NRaws,
+                                         const Array<index_t, NumDTensor>& DsStride)
    {
        return generate_tuple(
            [&](auto i) {
@@ -308,6 +310,7 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
    using Block2ETileMap =
        remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;

+#ifndef __HIPCC_RTC__
    // Argument
    struct Argument : public BaseArgument
    {
@@ -497,6 +500,8 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
        }
    };

+#endif
+
    static constexpr bool IsSupported(index_t MRaw_, index_t NRaw_, index_t KRaw_)
    {
        // check vector load/store
@@ -577,6 +582,7 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
        return true;
    }

+#ifndef __HIPCC_RTC__
    static bool IsSupportedArgument(const Argument& arg)
    {
        if(!ck::is_xdl_supported())
@@ -675,11 +681,13 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
    {
        auto str = std::stringstream();

-        std::map<LoopScheduler, std::string> LoopSchedToString{
-            {LoopScheduler::Default, "Default"}, {LoopScheduler::Interwave, "Interwave"}};
+        std::map<LoopScheduler, std::string> LoopSchedToString{{LoopScheduler::Default, "Default"},
+                                                               { LoopScheduler::Interwave,
+                                                                 "Interwave" }};

        std::map<PipelineVersion, std::string> PipelineVersionToString{{PipelineVersion::v1, "v1"},
-                                                                       {PipelineVersion::v2, "v2"}};
+                                                                       { PipelineVersion::v2,
+                                                                         "v2" }};

        // clang-format off
        str << "DeviceGemmMultipleD_Xdl_CShuffle"
@@ -708,6 +716,7 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,

        return str.str();
    }
+#endif

    template <class ADesc, class BDesc, class DsDesc, class EDesc>
    struct Descriptor
@@ -846,7 +855,9 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
                               EDataType* __restrict__ p_e_grid)
    {
        __shared__ char p_shared_block[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#ifndef __HIPCC_RTC__
        assert(desc.IsValid());
+#endif
        if(desc.has_main_k_block_loop)
        {
            GridwiseGemm::template Run<true>(p_a_grid,
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3.hpp
@@ -227,8 +227,20 @@ struct DeviceGemmMultiD_Xdl_CShuffle_V3 : public DeviceGemmMultipleDSplitK<ALayo
                }
            };

-            constexpr index_t minimum_occupancy =
-                BlkGemmPipeSched == BlockGemmPipelineScheduler::Intrawave ? 1 : 2;
+            constexpr index_t minimum_occupancy = []() {
+                if constexpr(BlkGemmPipeSched == BlockGemmPipelineScheduler::Interwave)
+                {
+                    return 2;
+                }
+                else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
+                {
+                    return (MPerBlock * NPerBlock / BlockSize <= 128) ? 2 : 1;
+                }
+                else
+                {
+                    return 1;
+                }
+            }();

            if(has_main_k_block_loop)
            {
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_b_preshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle_v3_b_preshuffle.hpp
@@ -0,0 +1,664 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+
+#include "ck/utility/common_header.hpp"
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp"
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#include "ck/host_utility/flush_cache.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename ALayout,
+          typename BLayout,
+          typename DsLayout,
+          typename CLayout,
+          typename ADataType,
+          typename BDataType,
+          typename DsDataType,
+          typename CDataType,
+          typename GemmAccDataType,
+          typename CShuffleDataType,
+          typename AElementwiseOperation,
+          typename BElementwiseOperation,
+          typename CElementwiseOperation,
+          GemmSpecialization GemmSpec,
+          index_t BlockSize,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t AK1,
+          index_t BK1,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MXdlPerWave,
+          index_t NXdlPerWave,
+          typename ABlockTransferThreadClusterLengths_AK0_M_AK1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          index_t ABlockTransferSrcVectorDim,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t ABlockTransferDstScalarPerVector_AK1,
+          bool ABlockLdsExtraM,
+          typename BBlockTransferThreadClusterLengths_BK0_N_BK1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          index_t BBlockTransferSrcVectorDim,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t BBlockTransferDstScalarPerVector_BK1,
+          bool BBlockLdsExtraN,
+          index_t CShuffleMXdlPerWavePerShuffle,
+          index_t CShuffleNXdlPerWavePerShuffle,
+          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          typename CDEShuffleBlockTransferScalarPerVectors,
+          BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
+          BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1,
+          typename ComputeTypeA                       = CDataType,
+          typename ComputeTypeB                       = ComputeTypeA,
+          typename LDSTypeA                           = ComputeTypeA,
+          typename LDSTypeB                           = ComputeTypeB>
+struct DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle
+    : public DeviceGemmMultipleDSplitKBPreShuffle<ALayout,
+                                                  BLayout,
+                                                  DsLayout,
+                                                  CLayout,
+                                                  ADataType,
+                                                  BDataType,
+                                                  DsDataType,
+                                                  CDataType,
+                                                  AElementwiseOperation,
+                                                  BElementwiseOperation,
+                                                  CElementwiseOperation>
+{
+    static constexpr index_t NumDTensor = DsDataType::Size();
+
+    // GridwiseGemm
+    using GridwiseGemm = GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle<
+        ALayout,
+        BLayout,
+        DsLayout,
+        CLayout,
+        ADataType,
+        BDataType,
+        GemmAccDataType,
+        CShuffleDataType,
+        DsDataType,
+        CDataType,
+        AElementwiseOperation,
+        BElementwiseOperation,
+        CElementwiseOperation,
+        GemmSpec,
+        BlockSize,
+        MPerBlock,
+        NPerBlock,
+        KPerBlock,
+        AK1,
+        BK1,
+        MPerXDL,
+        NPerXDL,
+        MXdlPerWave,
+        NXdlPerWave,
+        ABlockTransferThreadClusterLengths_AK0_M_AK1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorDim,
+        ABlockTransferSrcScalarPerVector,
+        ABlockTransferDstScalarPerVector_AK1,
+        false,
+        ABlockLdsExtraM,
+        BBlockTransferThreadClusterLengths_BK0_N_BK1,
+        BBlockTransferThreadClusterArrangeOrder,
+        BBlockTransferSrcAccessOrder,
+        BBlockTransferSrcVectorDim,
+        BBlockTransferSrcScalarPerVector,
+        BBlockTransferDstScalarPerVector_BK1,
+        false,
+        BBlockLdsExtraN,
+        CShuffleMXdlPerWavePerShuffle,
+        CShuffleNXdlPerWavePerShuffle,
+        CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+        CDEShuffleBlockTransferScalarPerVectors,
+        BlkGemmPipeSched,
+        BlkGemmPipelineVer,
+        ComputeTypeA,
+        ComputeTypeB,
+        LDSTypeA,
+        LDSTypeB>;
+
+    using Argument = typename GridwiseGemm::Argument;
+
+    int GetPreShuffleParameters() override { return NPerXDL; }
+
+    // Invoker
+    struct Invoker : public BaseInvoker
+    {
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            if(stream_config.log_level_ > 0)
+            {
+                arg.Print();
+            }
+
+            if(!GridwiseGemm::CheckValidity(arg))
+            {
+                throw std::runtime_error("wrong! GridwiseGemm has invalid setting");
+            }
+
+            index_t gdx, gdy, gdz;
+            std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(arg.M, arg.N, arg.KBatch);
+
+            float ave_time = 0;
+
+            index_t k_grain = arg.KBatch * KPerBlock;
+            index_t K_split = (arg.K + k_grain - 1) / k_grain * KPerBlock;
+
+            const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split);
+
+            const auto Run = [&](const auto& kernel) {
+                if(stream_config.flush_cache)
+                {
+
+                    std::array<std::size_t, NumDTensor> DsSize;
+
+                    Argument arg_ = arg;
+
+                    const auto a_grid_desc_ak0_m_ak1 = GridwiseGemm::MakeAGridDescriptor_AK0_M_AK1(
+                        arg_.M, arg_.MPadded, arg_.K, arg_.KPadded, arg_.StrideA, arg_.AK0);
+                    const auto b_grid_desc_bk0_n_bk1 = GridwiseGemm::MakeBGridDescriptor_BK0_N_BK1(
+                        arg_.K, arg_.KPadded, arg_.N, arg_.NPadded, arg_.StrideB, arg_.BK0);
+
+                    auto size_a_buffer =
+                        a_grid_desc_ak0_m_ak1.GetElementSpaceSize() * sizeof(ADataType);
+                    auto size_b_buffer =
+                        b_grid_desc_bk0_n_bk1.GetElementSpaceSize() * sizeof(BDataType);
+
+                    const auto ds_grid_desc_m_n = GridwiseGemm::MakeDsGridDescriptor_M_N(
+                        arg_.M, arg_.MPadded, arg_.N, arg_.NPadded, arg_.StrideDs);
+
+                    static_for<0, NumDTensor, 1>{}([&](auto i) {
+                        using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;
+                        DsSize[i] = ds_grid_desc_m_n[i].GetElementSpaceSize() * sizeof(DDataType);
+                    });
+                    ck::utility::RotatingMemWrapperMultiD<Argument, DsDataType> rotating_mem(
+                        arg_, stream_config.rotating_count, size_a_buffer, size_b_buffer, DsSize);
+                    rotating_mem.Print();
+
+                    auto run_flush_cache = [&]() {
+                        // flush icache
+                        ck::utility::flush_icache();
+                        // rotating mem
+                        rotating_mem.Next();
+                        // clear c mem
+                        if(arg_.KBatch > 1)
+                            hipGetErrorString(hipMemsetAsync(arg_.p_c_grid,
+                                                             0,
+                                                             arg_.M * arg_.N * sizeof(CDataType),
+                                                             stream_config.stream_id_));
+                    };
+
+                    ave_time = ck::utility::launch_and_time_kernel_with_preprocess<false>(
+                        stream_config,
+                        run_flush_cache,
+                        kernel,
+                        dim3(gdx, gdy, gdz),
+                        dim3(BlockSize),
+                        0,
+                        arg_);
+                }
+                else
+                {
+                    if(arg.KBatch > 1)
+                        hipGetErrorString(hipMemsetAsync(arg.p_c_grid,
+                                                         0,
+                                                         arg.M * arg.N * sizeof(CDataType),
+                                                         stream_config.stream_id_));
+
+                    ave_time = launch_and_time_kernel(
+                        stream_config, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, arg);
+                }
+            };
+
+            constexpr auto estimated_reg_a = MPerBlock * KPerBlock * sizeof(ADataType) / BlockSize /
+                                             4 * (1 + GridwiseGemm::NWave);
+            constexpr auto estimated_reg_b =
+                NPerBlock * KPerBlock * sizeof(BDataType) / BlockSize / 4 * (2);
+            constexpr auto estimated_reg_c =
+                MPerBlock * NPerBlock * sizeof(GemmAccDataType) / BlockSize / 4;
+            constexpr auto estimated_reg_total =
+                estimated_reg_a + estimated_reg_b + estimated_reg_c;
+
+            constexpr index_t minimum_occupancy = (estimated_reg_total >= 256) ? 1 : 2;
+
+            // static_assert(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3 &&
+            // has_main_k_block_loop, "only impl BlockGemmPipelineVersion::v3 and has mainloop right
+            // now");
+            if(has_main_k_block_loop)
+            {
+                // Tail number always full
+                if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel = kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle<
+                                GridwiseGemm,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel = kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle<
+                                GridwiseGemm,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                    else
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel = kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle<
+                                GridwiseGemm,
+                                true,
+                                InMemoryDataOperationEnum::Set,
+                                minimum_occupancy,
+                                TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel = kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle<
+                                GridwiseGemm,
+                                true,
+                                InMemoryDataOperationEnum::Set,
+                                minimum_occupancy,
+                                TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                }
+                else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2 ||
+                                  BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle_2lds<
+                                    GridwiseGemm,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle_2lds<
+                                    GridwiseGemm,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                    else
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle_2lds<
+                                    GridwiseGemm,
+                                    true,
+                                    InMemoryDataOperationEnum::Set,
+                                    minimum_occupancy,
+                                    TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle_2lds<
+                                    GridwiseGemm,
+                                    true,
+                                    InMemoryDataOperationEnum::Set,
+                                    minimum_occupancy,
+                                    TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                }
+                else
+                {
+                    throw std::runtime_error("todo: only v1 v2 and v3 support now");
+                }
+            }
+#if 0
+            else
+            {
+                if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
+                {
+#if 0
+                    if(arg.KBatch > 1)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel = kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle<
+                                GridwiseGemm,
+                                false,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel = kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle<
+                                GridwiseGemm,
+                                false,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                    else
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel = kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle<
+                                GridwiseGemm,
+                                false,
+                                InMemoryDataOperationEnum::Set,
+                                minimum_occupancy,
+                                TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel = kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle<
+                                GridwiseGemm,
+                                false,
+                                InMemoryDataOperationEnum::Set,
+                                minimum_occupancy,
+                                TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+#endif
+                    if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                    {
+                        const auto kernel = kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle<
+                            GridwiseGemm,
+                            false,
+                            InMemoryDataOperationEnum::Set,
+                            minimum_occupancy,
+                            TailNumber::Odd>;
+                        Run(kernel);
+                    }
+                    else
+                    {
+                        const auto kernel = kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle<
+                            GridwiseGemm,
+                            false,
+                            InMemoryDataOperationEnum::Set,
+                            minimum_occupancy,
+                            TailNumber::Even>;
+                        Run(kernel);
+                    }
+                }
+                else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2 || BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
+                {
+                    if(arg.KBatch > 1)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle_2lds<
+                                    GridwiseGemm,
+                                    false,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle_2lds<
+                                    GridwiseGemm,
+                                    false,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                    else
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle_2lds<
+                                    GridwiseGemm,
+                                    false,
+                                    InMemoryDataOperationEnum::Set,
+                                    minimum_occupancy,
+                                    TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel =
+                                kernel_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle_2lds<
+                                    GridwiseGemm,
+                                    false,
+                                    InMemoryDataOperationEnum::Set,
+                                    minimum_occupancy,
+                                    TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                }
+                else
+                {
+                    throw std::runtime_error("todo: only v3 support now");
+                }
+            }
+#endif
+
+            return ave_time;
+        }
+
+        // polymorphic
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+
+    static bool IsSupportedArgument(const Argument& arg)
+    {
+        if(!ck::is_xdl_supported())
+        {
+            return false;
+        }
+
+        if(!is_bf16_atomic_supported() && std::is_same_v<CDataType, ck::bhalf_t> && arg.KBatch > 1)
+        {
+            return false;
+        }
+
+        if((arg.K % AK1 != 0 || arg.K % BK1 != 0) && !(GemmSpec == GemmSpecialization::MKPadding ||
+                                                       GemmSpec == GemmSpecialization::NKPadding ||
+                                                       GemmSpec == GemmSpecialization::MNKPadding ||
+                                                       GemmSpec == GemmSpecialization::KPadding))
+        {
+            return false;
+        }
+
+        if(arg.N % NPerBlock != 0 || arg.K % KPerBlock != 0)
+        {
+            return false;
+        }
+
+        return GridwiseGemm::CheckValidity(arg);
+    }
+
+    // polymorphic
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
+    }
+
+    static auto MakeArgument(const void* p_a,
+                             const void* p_b,
+                             std::array<const void*, NumDTensor> p_ds,
+                             void* p_c,
+                             index_t M,
+                             index_t N,
+                             index_t K,
+                             index_t StrideA,
+                             index_t StrideB,
+                             std::array<index_t, NumDTensor> StrideDs,
+                             index_t StrideC,
+                             index_t KBatch,
+                             AElementwiseOperation a_element_op,
+                             BElementwiseOperation b_element_op,
+                             CElementwiseOperation c_element_op)
+    {
+        return Argument{static_cast<const ADataType*>(p_a),
+                        static_cast<const BDataType*>(p_b),
+                        p_ds,
+                        static_cast<CDataType*>(p_c),
+                        M,
+                        N,
+                        K,
+                        StrideA,
+                        StrideB,
+                        StrideDs,
+                        StrideC,
+                        KBatch,
+                        a_element_op,
+                        b_element_op,
+                        c_element_op};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    // polymorphic
+    std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
+                                                      const void* p_b,
+                                                      std::array<const void*, NumDTensor> p_ds,
+                                                      void* p_c,
+                                                      index_t M,
+                                                      index_t N,
+                                                      index_t K,
+                                                      index_t StrideA,
+                                                      index_t StrideB,
+                                                      std::array<ck::index_t, NumDTensor> StrideDs,
+                                                      index_t StrideC,
+                                                      index_t KBatch,
+                                                      AElementwiseOperation a_element_op,
+                                                      BElementwiseOperation b_element_op,
+                                                      CElementwiseOperation c_element_op) override
+    {
+        return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
+                                          static_cast<const BDataType*>(p_b),
+                                          p_ds,
+                                          static_cast<CDataType*>(p_c),
+                                          M,
+                                          N,
+                                          K,
+                                          StrideA,
+                                          StrideB,
+                                          StrideDs,
+                                          StrideC,
+                                          KBatch,
+                                          a_element_op,
+                                          b_element_op,
+                                          c_element_op);
+    }
+
+    // polymorphic
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    // polymorphic
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        std::map<BlockGemmPipelineScheduler, std::string> BlkGemmPipelineSchedulerToString{
+            {BlockGemmPipelineScheduler::Intrawave, "Intrawave"},
+            {BlockGemmPipelineScheduler::Interwave, "Interwave"}};
+
+        std::map<BlockGemmPipelineVersion, std::string> BlkGemmPipelineVersionToString{
+            {BlockGemmPipelineVersion::v1, "v1"},
+            {BlockGemmPipelineVersion::v2, "v2"},
+            {BlockGemmPipelineVersion::v3, "v3"}};
+
+        // clang-format off
+        str << "DeviceGemmXdlUniversal"
+            << "<"
+            << getGemmSpecializationString(GemmSpec) << ", "
+            << std::string(ALayout::name)[0]
+            << std::string(BLayout::name)[0]
+            << std::string(CLayout::name)[0]
+            << ">"
+            << " BlkSize: "
+            << BlockSize << ", "
+            << "BlkTile: "
+            << MPerBlock<<"x"<<NPerBlock<<"x"<<KPerBlock << ", "
+            << "WaveTile: "
+            << MPerXDL<<"x"<<NPerXDL << ", "
+            << "WaveMap: "
+            << MXdlPerWave<<"x" << NXdlPerWave<<", "
+            << "VmemReadVec: "
+            << ABlockTransferSrcScalarPerVector<<"x"<<BBlockTransferSrcScalarPerVector<<", "
+            << "BlkGemmPipelineScheduler: "
+            << BlkGemmPipelineSchedulerToString[BlkGemmPipeSched] << ", "
+            << "BlkGemmPipelineVersion: "
+            << BlkGemmPipelineVersionToString[BlkGemmPipelineVer] << ", "
+            << "BlkGemmPipelinePrefetchStages: "
+            << GridwiseGemm::BlockwiseGemmPipe::PrefetchStages;
+        // clang-format on
+
+        return str.str();
+    }
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_xdl_cshuffle_v3.hpp
@@ -210,8 +210,20 @@ struct DeviceGemm_Xdl_CShuffleV3 : public DeviceGemmV2<ALayout,
                }
            };

-            constexpr index_t minimum_occupancy =
-                BlkGemmPipeSched == BlockGemmPipelineScheduler::Intrawave ? 1 : 2;
+            constexpr index_t minimum_occupancy = []() {
+                if constexpr(BlkGemmPipeSched == BlockGemmPipelineScheduler::Interwave)
+                {
+                    return 2;
+                }
+                else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
+                {
+                    return (MPerBlock * NPerBlock / BlockSize <= 128) ? 2 : 1;
+                }
+                else
+                {
+                    return 1;
+                }
+            }();

            if(has_main_k_block_loop)
            {
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
@@ -13,8 +13,10 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp"
 #include "ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm.hpp"
+#include "ck/tensor_operation/operator_transform/transform_conv_ngchw_to_nhwgc.hpp"
 #include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
 #include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
@@ -138,8 +140,10 @@ template <ck::index_t NDimSpatial,
          index_t CShuffleNXdlPerWavePerShuffle,
          typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
          index_t CBlockTransferScalarPerVector_NWaveNPerXdl,
-          typename ComputeTypeA = InDataType,
-          typename ComputeTypeB = ComputeTypeA>
+          typename ComputeTypeA                          = InDataType,
+          typename ComputeTypeB                          = ComputeTypeA,
+          index_t MaxTransposeTransferSrcScalarPerVector = 1,
+          index_t MaxTransposeTransferDstScalarPerVector = 1>
 struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
    : public DeviceGroupedConvBwdWeight<NDimSpatial,
                                        InLayout,
@@ -160,6 +164,11 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
    using BDataType = InDataType;
    using CDataType = WeiDataType;

+    // If NGCHW then ADataType must be equal to BDataType
+    static_assert(!(is_NGCHW_GKYXC_NGKHW<InLayout, WeiLayout, OutLayout>() ||
+                    is_NGCDHW_GKZYXC_NGKDHW<InLayout, WeiLayout, OutLayout>()) ||
+                  is_same_v<ADataType, BDataType>);
+
    using AElementwiseOperation = OutElementwiseOperation;
    using BElementwiseOperation = InElementwiseOperation;
    using CElementwiseOperation = WeiElementwiseOperation;
@@ -279,6 +288,51 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
    using BGridDesc_K0_N_K1 = remove_cvref_t<decltype(ABCGridDescs{}[I1])>;
    using CGridDesc_M_N     = remove_cvref_t<decltype(ABCGridDescs{}[I2])>;

+    static constexpr index_t ClusterLengthMPerBlock =
+        CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock::At(1);
+    static constexpr index_t ClusterLengthNPerBlock =
+        CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock::At(3);
+
+    static constexpr auto conv_ngchw_to_nhwgc_transformer =
+        TransformConvNGCHWToNHWGC<InLayout,
+                                  WeiLayout,
+                                  OutLayout,
+                                  NDimSpatial,
+                                  MPerBlock / ClusterLengthMPerBlock,
+                                  NPerBlock / ClusterLengthNPerBlock>{};
+
+    using Block2TileMapElementwise = BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock>;
+
+    static constexpr index_t TransposeTransferSrcScalarPerVectorAligned =
+        std::min(NPerBlock / ClusterLengthNPerBlock, MaxTransposeTransferSrcScalarPerVector);
+    static constexpr index_t TransposeTransferDstScalarPerVectorAligned =
+        std::min(MPerBlock / ClusterLengthMPerBlock, MaxTransposeTransferDstScalarPerVector);
+
+    using NGCHWTransposeDescType =
+        remove_cvref_t<decltype(conv_ngchw_to_nhwgc_transformer
+                                    .template MakeNGCHWTransposeDesc<NDimSpatial>({}, {}))>;
+    using NHWGCTransposeDescType =
+        remove_cvref_t<decltype(conv_ngchw_to_nhwgc_transformer
+                                    .template MakeNHWGCTransposeDesc<NDimSpatial>({}, {}))>;
+
+    using GridwiseElementwiseTranspose =
+        GridwiseElementwise<Tuple<NGCHWTransposeDescType>,
+                            Tuple<NHWGCTransposeDescType>,
+                            Tuple<const ADataType*>,
+                            Tuple<ADataType*>,
+                            Block2TileMapElementwise,
+                            element_wise::PassThrough,
+                            BlockSize,
+                            MPerBlock,
+                            NPerBlock,
+                            MPerBlock / ClusterLengthMPerBlock,
+                            NPerBlock / ClusterLengthNPerBlock,
+                            Sequence<1, 0>,
+                            Sequence<TransposeTransferSrcScalarPerVectorAligned>,
+                            Sequence<TransposeTransferDstScalarPerVectorAligned>,
+                            I1,
+                            I0>;
+
    using GridwiseGemm = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight<
        BlockSize,
        ADataType,
@@ -398,6 +452,13 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                      end(a_g_n_k_wos_lengths),
                      begin(output_spatial_lengths_));

+            std::array<index_t, NDimSpatial + 3> b_g_n_c_wis_strides_transposed =
+                conv_ngchw_to_nhwgc_transformer.TransposeStrides(b_g_n_c_wis_lengths,
+                                                                 b_g_n_c_wis_strides);
+            std::array<index_t, NDimSpatial + 3> a_g_n_k_wos_strides_transposed =
+                conv_ngchw_to_nhwgc_transformer.TransposeStrides(a_g_n_k_wos_lengths,
+                                                                 a_g_n_k_wos_strides);
+
            const auto descs =
                conv_to_gemm_transformer
                    .template MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<NDimSpatial>(
@@ -407,9 +468,9 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                        input_spatial_lengths_,
                        filter_spatial_lengths_,
                        output_spatial_lengths_,
-                        b_g_n_c_wis_strides,
+                        b_g_n_c_wis_strides_transposed,
                        e_g_k_c_xs_strides,
-                        a_g_n_k_wos_strides,
+                        a_g_n_k_wos_strides_transposed,
                        conv_filter_strides,
                        conv_filter_dilations,
                        input_left_pads,
@@ -424,8 +485,8 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                GridwiseGemm::MakeCBlockClusterAdaptor(c_grid_desc_m_n_, M01, N01, k_batch_);

            // A/B/C Batch Stride
-            compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides[0];
-            compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_n_c_wis_strides[0];
+            compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides_transposed[0];
+            compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_n_c_wis_strides_transposed[0];
            compute_ptr_offset_of_batch_.BatchStrideC_ =
                Conv_K_ * Conv_C_ *
                std::accumulate(begin(filter_spatial_lengths_),
@@ -441,6 +502,54 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                c_grid_desc_mblock_mperblock_nblock_nperblock_ =
                    GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n_);
            }
+
+            if constexpr(is_NGCHW_GKYXC_NGKHW<InLayout, WeiLayout, OutLayout>() ||
+                         is_NGCDHW_GKZYXC_NGKDHW<InLayout, WeiLayout, OutLayout>())
+            {
+                a_in_transpose_desc_ =
+                    conv_ngchw_to_nhwgc_transformer.template MakeNGCHWTransposeDesc<NDimSpatial>(
+                        a_g_n_k_wos_lengths, a_g_n_k_wos_strides);
+                a_out_transpose_desc_ =
+                    conv_ngchw_to_nhwgc_transformer.template MakeNHWGCTransposeDesc<NDimSpatial>(
+                        a_g_n_k_wos_lengths, a_g_n_k_wos_strides);
+
+                b_in_transpose_desc_ =
+                    conv_ngchw_to_nhwgc_transformer.template MakeNGCHWTransposeDesc<NDimSpatial>(
+                        b_g_n_c_wis_lengths, b_g_n_c_wis_strides);
+                b_out_transpose_desc_ =
+                    conv_ngchw_to_nhwgc_transformer.template MakeNHWGCTransposeDesc<NDimSpatial>(
+                        b_g_n_c_wis_lengths, b_g_n_c_wis_strides);
+
+                elementwise_block_2_ctile_map_transpose_a_ = Block2TileMapElementwise{
+                    a_in_transpose_desc_.GetLength(I0), a_in_transpose_desc_.GetLength(I1)};
+
+                elementwise_block_2_ctile_map_transpose_b_ = Block2TileMapElementwise{
+                    b_in_transpose_desc_.GetLength(I0), b_in_transpose_desc_.GetLength(I1)};
+            }
+        }
+
+        std::size_t GetWorkspaceATensorSizeBytes() const
+        {
+            return sizeof(ADataType) * a_in_transpose_desc_.GetElementSpaceSize();
+        }
+
+        std::size_t GetWorkspaceBTensorSizeBytes() const
+        {
+            return sizeof(BDataType) * b_in_transpose_desc_.GetElementSpaceSize();
+        }
+
+        std::size_t GetWorkspaceSizeBytes() const
+        {
+            // Transpose require workspace for A and B
+            if constexpr(is_NGCHW_GKYXC_NGKHW<InLayout, WeiLayout, OutLayout>() ||
+                         is_NGCDHW_GKZYXC_NGKDHW<InLayout, WeiLayout, OutLayout>())
+            {
+                return GetWorkspaceATensorSizeBytes() + GetWorkspaceBTensorSizeBytes();
+            }
+            else
+            {
+                return 0;
+            }
        }

        const ADataType* p_a_grid_;
@@ -453,6 +562,12 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle

        Block2CTileMap block_2_ctile_map_;

+        Block2TileMapElementwise elementwise_block_2_ctile_map_transpose_a_,
+            elementwise_block_2_ctile_map_transpose_b_;
+
+        NGCHWTransposeDescType a_in_transpose_desc_, b_in_transpose_desc_;
+        NHWGCTransposeDescType a_out_transpose_desc_, b_out_transpose_desc_;
+
        // for computing batch offset
        ComputePtrOffsetOfStridedBatch<> compute_ptr_offset_of_batch_;

@@ -502,13 +617,57 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle

        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
        {
-            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
-                                            arg.b_grid_desc_kbatch_k0_n_k1_,
-                                            arg.c_grid_desc_m_n_,
-                                            arg.block_2_ctile_map_))
+            float avg_time = 0.f;
+
+            const ADataType* p_a_grid = arg.p_a_grid_;
+            const BDataType* p_b_grid = arg.p_b_grid_;
+
+            if constexpr(is_NGCHW_GKYXC_NGKHW<InLayout, WeiLayout, OutLayout>() ||
+                         is_NGCDHW_GKZYXC_NGKDHW<InLayout, WeiLayout, OutLayout>())
            {
-                throw std::runtime_error(
-                    "wrong! GridwiseGemm_km_kn_m0m1n0n1_xdlops_v3r1 has invalid setting");
+                const index_t grid_size_a =
+                    arg.elementwise_block_2_ctile_map_transpose_a_.CalculateGridSize(
+                        arg.a_in_transpose_desc_);
+                const index_t grid_size_b =
+                    arg.elementwise_block_2_ctile_map_transpose_b_.CalculateGridSize(
+                        arg.b_in_transpose_desc_);
+
+                p_a_grid = type_convert<const ADataType*>(arg.p_workspace_);
+                p_b_grid = type_convert<const BDataType*>(arg.p_workspace_) +
+                           arg.GetWorkspaceATensorSizeBytes() / sizeof(BDataType);
+                ADataType* p_out_a_grid = type_convert<ADataType*>(arg.p_workspace_);
+                BDataType* p_out_b_grid = type_convert<BDataType*>(arg.p_workspace_) +
+                                          arg.GetWorkspaceATensorSizeBytes() / sizeof(BDataType);
+
+                // Different data type for A and B is not supported
+                auto kernel_transpose = kernel_elementwise_dual<GridwiseElementwiseTranspose,
+                                                                ck::Tuple<NGCHWTransposeDescType>,
+                                                                ck::Tuple<NGCHWTransposeDescType>,
+                                                                ck::Tuple<NHWGCTransposeDescType>,
+                                                                ck::Tuple<NHWGCTransposeDescType>,
+                                                                ck::Tuple<const ADataType*>,
+                                                                ck::Tuple<ADataType*>,
+                                                                Block2TileMapElementwise,
+                                                                Block2TileMapElementwise,
+                                                                element_wise::PassThrough>;
+
+                avg_time += launch_and_time_kernel(stream_config,
+                                                   kernel_transpose,
+                                                   dim3(grid_size_a + grid_size_b),
+                                                   dim3(BlockSize),
+                                                   0,
+                                                   make_tuple(arg.a_in_transpose_desc_),
+                                                   make_tuple(arg.b_in_transpose_desc_),
+                                                   make_tuple(arg.a_out_transpose_desc_),
+                                                   make_tuple(arg.b_out_transpose_desc_),
+                                                   make_tuple(arg.p_a_grid_),
+                                                   make_tuple(arg.p_b_grid_),
+                                                   make_tuple(p_out_a_grid),
+                                                   make_tuple(p_out_b_grid),
+                                                   arg.elementwise_block_2_ctile_map_transpose_a_,
+                                                   arg.elementwise_block_2_ctile_map_transpose_b_,
+                                                   element_wise::PassThrough{},
+                                                   grid_size_a);
            }

            const index_t grid_size =
@@ -536,33 +695,35 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                    ComputePtrOffsetOfStridedBatch<>,
                    has_main_loop>;

-                return launch_and_time_kernel(stream_config,
-                                              kernel,
-                                              dim3(grid_size),
-                                              dim3(BlockSize),
-                                              0,
-                                              arg.p_a_grid_,
-                                              arg.p_b_grid_,
-                                              arg.p_c_grid_,
-                                              arg.a_element_op_,
-                                              arg.b_element_op_,
-                                              arg.c_element_op_,
-                                              arg.Conv_G_,
-                                              arg.a_grid_desc_kbatch_k0_m_k1_,
-                                              arg.b_grid_desc_kbatch_k0_n_k1_,
-                                              arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                              arg.block_2_ctile_map_,
-                                              arg.compute_ptr_offset_of_batch_);
+                avg_time +=
+                    launch_and_time_kernel(stream_config,
+                                           kernel,
+                                           dim3(grid_size),
+                                           dim3(BlockSize),
+                                           0,
+                                           p_a_grid,
+                                           p_b_grid,
+                                           arg.p_c_grid_,
+                                           arg.a_element_op_,
+                                           arg.b_element_op_,
+                                           arg.c_element_op_,
+                                           arg.Conv_G_,
+                                           arg.a_grid_desc_kbatch_k0_m_k1_,
+                                           arg.b_grid_desc_kbatch_k0_n_k1_,
+                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.block_2_ctile_map_,
+                                           arg.compute_ptr_offset_of_batch_);
            };

            if(has_main_k0_block_loop)
            {
-                return launch_kernel(integral_constant<bool, true>{});
+                launch_kernel(integral_constant<bool, true>{});
            }
            else
            {
-                return launch_kernel(integral_constant<bool, false>{});
+                launch_kernel(integral_constant<bool, false>{});
            }
+            return avg_time;
        }

        float Run(const BaseArgument* p_arg,
@@ -598,7 +759,8 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
        else if constexpr(NDimSpatial == 2)
        {
            if constexpr(!(is_NHWGC_GKYXC_NHWGK<InLayout, WeiLayout, OutLayout>() ||
-                           is_GNHWC_GKYXC_GNHWK<InLayout, WeiLayout, OutLayout>()))
+                           is_GNHWC_GKYXC_GNHWK<InLayout, WeiLayout, OutLayout>() ||
+                           is_NGCHW_GKYXC_NGKHW<InLayout, WeiLayout, OutLayout>()))
            {
                return false;
            }
@@ -606,7 +768,8 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
        else if constexpr(NDimSpatial == 3)
        {
            if constexpr(!(is_NDHWGC_GKZYXC_NDHWGK<InLayout, WeiLayout, OutLayout>() ||
-                           is_GNDHWC_GKZYXC_GNDHWK<InLayout, WeiLayout, OutLayout>()))
+                           is_GNDHWC_GKZYXC_GNDHWK<InLayout, WeiLayout, OutLayout>() ||
+                           is_NGCDHW_GKZYXC_NGKDHW<InLayout, WeiLayout, OutLayout>()))
            {
                return false;
            }
@@ -644,6 +807,35 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
            return false;
        }

+        if constexpr(is_NGCHW_GKYXC_NGKHW<InLayout, WeiLayout, OutLayout>() ||
+                     is_NGCDHW_GKZYXC_NGKDHW<InLayout, WeiLayout, OutLayout>())
+        {
+            if((arg.Conv_G_ * arg.Conv_C_) % TransposeTransferDstScalarPerVectorAligned != 0)
+            {
+                return false;
+            }
+
+            if((arg.Conv_G_ * arg.Conv_K_) % TransposeTransferDstScalarPerVectorAligned != 0)
+            {
+                return false;
+            }
+
+            const index_t input_spatial_acum = ck::accumulate_n<index_t>(
+                arg.input_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
+            const index_t output_spatial_acum = ck::accumulate_n<index_t>(
+                arg.output_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
+
+            if(input_spatial_acum % TransposeTransferSrcScalarPerVectorAligned != 0)
+            {
+                return false;
+            }
+
+            if(output_spatial_acum % TransposeTransferSrcScalarPerVectorAligned != 0)
+            {
+                return false;
+            }
+        }
+
        // Gridwise GEMM size
        return GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
                                           arg.b_grid_desc_kbatch_k0_n_k1_,
@@ -764,12 +956,49 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
            << BBlockTransferDstScalarPerVector_K1 << ", "
            << CShuffleMXdlPerWavePerShuffle << ", "
            << CShuffleNXdlPerWavePerShuffle << ", "
-            << CBlockTransferScalarPerVector_NWaveNPerXdl
-            << ">";
+            << CBlockTransferScalarPerVector_NWaveNPerXdl;
+
+        if constexpr(is_NGCHW_GKYXC_NGKHW<InLayout, WeiLayout, OutLayout>() || 
+                        is_NGCDHW_GKZYXC_NGKDHW<InLayout, WeiLayout, OutLayout>()) {
+                str << ", TransposeTransferSrcScalarPerVectorAligned: "
+                << TransposeTransferSrcScalarPerVectorAligned <<", "
+                << "TransposeTransferDstScalarPerVectorAligned: " << TransposeTransferDstScalarPerVectorAligned;
+            }
+
+            
+            str << ">";
        // clang-format on

        return str.str();
    }
+
+    size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override
+    {
+        auto arg = dynamic_cast<const Argument*>(p_arg);
+        if(arg)
+        {
+            return arg->GetWorkspaceSizeBytes();
+        }
+        else
+            throw std::runtime_error(
+                "The argument pointer is not an object of "
+                "DeviceGroupedConvBwdWeight_Xdl_CShuffle::Argument structure!");
+    }
+
+    void SetWorkSpacePointer(BaseArgument* p_arg,
+                             void* p_workspace,
+                             const StreamConfig& = StreamConfig{}) const override
+    {
+        auto p_arg_ = dynamic_cast<Argument*>(p_arg);
+        if(p_arg_)
+        {
+            p_arg_->p_workspace_ = p_workspace;
+        }
+        else
+            throw std::runtime_error(
+                "The argument pointer is not an object of "
+                "DeviceGroupedConvBwdWeight_Xdl_CShuffle::Argument structure!");
+    }
 };

 } // namespace device
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp
--- a/include/ck/tensor_operation/gpu/device/masking_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/masking_specialization.hpp
@@ -13,6 +13,7 @@ enum struct MaskingSpecialization
    MaskOutUpperTriangle
 };

+#ifndef __HIPCC_RTC__
 inline std::string getMaskingSpecializationString(const MaskingSpecialization& s)
 {
    switch(s)
@@ -22,6 +23,7 @@ inline std::string getMaskingSpecializationString(const MaskingSpecialization& s
    default: return "Unrecognized specialization!";
    }
 }
+#endif

 struct MaskDisabledPredicate
 {
@@ -53,7 +55,7 @@ struct MaskOutUpperTrianglePredicate
 template <typename MaskOutPredicate>
 struct C0MatrixMask_impl
 {
-    __host__ __device__ C0MatrixMask_impl(index_t NRaw)
+    __host__ __device__ constexpr C0MatrixMask_impl(index_t NRaw)
        : NRaw_(NRaw), predicate_(MaskOutPredicate{})
    {
    }
--- a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
+++ b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
@@ -21,6 +21,12 @@ struct ColumnMajor : public BaseTensorLayout
 {
    static constexpr const char* name = "ColumnMajor";
 };
+
+struct MFMA : public BaseTensorLayout
+{
+    static constexpr const char* name = "MFMA";
+};
+
 } // namespace gemm

 namespace convolution {
@@ -430,7 +436,7 @@ struct G_NDHW : public BaseTensorLayout

 } // namespace convolution

-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 template <
    typename Layout,
    typename std::enable_if<std::is_base_of<BaseTensorLayout, Layout>::value, bool>::type = false>
--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
@@ -283,6 +283,16 @@ struct MultiplyMultiply
        e = ck::type_convert<ck::half_t>(x0_f);
    }

+    template <>
+    __host__ __device__ constexpr void operator()<ck::half_t, int, float, float>(
+        ck::half_t& e, const int& c, const float& d0, const float& d1) const
+    {
+        const float x0_f =
+            ck::type_convert<float>(c) * ck::type_convert<float>(d0) * ck::type_convert<float>(d1);
+
+        e = ck::type_convert<ck::half_t>(x0_f);
+    }
+
    template <>
    __host__ __device__ constexpr void operator()<ck::bhalf_t, int, float, float>(
        ck::bhalf_t& e, const int& c, const float& d0, const float& d1) const
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -284,6 +284,12 @@ struct PassThrough
        y = type_convert<half_t>(x);
    }

+    template <>
+    __host__ __device__ void operator()<half_t, int32_t>(half_t& y, const int32_t& x) const
+    {
+        y = type_convert<half_t>(x);
+    }
+
    template <>
    __host__ __device__ void operator()<bhalf_t, bhalf_t>(bhalf_t& y, const bhalf_t& x) const
    {
@@ -691,7 +697,7 @@ struct FastGelu

    template <typename Y, typename X>
    __device__ void operator()(Y& y, const X& x) const;
-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
    template <>
    __host__ void operator()<float, float>(float& y, const float& x) const
    {
@@ -703,7 +709,6 @@ struct FastGelu
        y               = x / (1.f + emu);
    }
 #endif
-
    // device code, use lower precision "__ocml_exp_f32" and "rcp"
    template <>
    __device__ void operator()<float, float>(float& y, const float& x) const
--- a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
+++ b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
@@ -8,7 +8,7 @@
 #include "ck/utility/tuple.hpp"
 #include "ck/tensor_description/tensor_adaptor.hpp"
 #include "ck/tensor_description/multi_index_transform_helper.hpp"
-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 #include <limits>
 #include <stdlib.h>
 #endif
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
@@ -473,7 +473,7 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
        return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw);
    }

-#ifdef CK_CODE_GEN_RTC
+#if defined(__HIPCC_RTC__) || defined(CK_CODE_GEN_RTC)
    template <typename DsLayout, GemmSpecialization GemmSpec>
    __host__ __device__ static auto
    MakeDsGridDescriptor_M_N(const ck::Array<index_t, NumDTensor>& MRaws,
@@ -486,6 +486,7 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
                             const std::array<index_t, NumDTensor>& NRaws,
                             const std::array<index_t, NumDTensor>& DsStride)
 #endif
+
    {
        return generate_tuple(
            [&](auto i) {
@@ -949,7 +950,7 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
                               const index_t K,
                               const index_t StrideA,
                               const index_t StrideB,
-#ifdef CK_CODE_GEN_RTC
+#if defined(__HIPCC_RTC__) || defined(CK_CODE_GEN_RTC)
                               const ck::Array<index_t, NumDTensor> StrideDs,
 #else
                               const std::array<index_t, NumDTensor> StrideDs,
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp
@@ -2,7 +2,8 @@
 // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
-#ifndef CK_CODE_GEN_RTC
+
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 #include <iostream>
 #include <ostream>
 #endif
@@ -54,7 +55,7 @@ constexpr auto GridwiseGemmPipeline_Selector()
    }
    else
    {
-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
        std::cerr << "GridwiseGemmPipeline configuration is not available" << std::endl;
 #endif
    }
@@ -62,7 +63,7 @@ constexpr auto GridwiseGemmPipeline_Selector()

 } // namespace ck

-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 inline std::ostream& operator<<(std::ostream& os, const ck::PipelineVersion& p)
 {
    switch(p)
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp
@@ -230,6 +230,23 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
            }
        }();

+        // Pad both M and K to be multiples of the block sizes
+        const auto a_grid_desc_m_k =
+            transform_tensor_descriptor(a_grid_desc_mraw_kraw,
+                                        make_tuple(make_right_pad_transform(M, MPad - M),
+                                                   make_right_pad_transform(K, KPad - K)),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+        const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+            a_grid_desc_m_k,
+            make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                       make_pass_through_transform(MPad)),
+            make_tuple(Sequence<1>{}, Sequence<0>{}),
+            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+        return a_grid_desc_ak0_m_ak1;
+#if 0
        using GemmSpecialization = tensor_operation::device::GemmSpecialization;

        if constexpr(GemmSpec == GemmSpecialization::MKPadding ||
@@ -296,6 +313,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3

            return a_grid_desc_ak0_m_ak1;
        }
+#endif
    }

    __device__ static auto MakeBGridDescriptor_BK0_N_BK1(
@@ -312,6 +330,23 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
            }
        }();

+        // Pad both N and K to be multiples of the block sizes
+        const auto b_grid_desc_n_k =
+            transform_tensor_descriptor(b_grid_desc_nraw_kraw,
+                                        make_tuple(make_right_pad_transform(N, NPad - N),
+                                                   make_right_pad_transform(K, KPad - K)),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+        const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+            b_grid_desc_n_k,
+            make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                       make_pass_through_transform(NPad)),
+            make_tuple(Sequence<1>{}, Sequence<0>{}),
+            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+        return b_grid_desc_bk0_n_bk1;
+#if 0     
        using GemmSpecialization = tensor_operation::device::GemmSpecialization;

        if constexpr(GemmSpec == GemmSpecialization::NKPadding ||
@@ -378,6 +413,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3

            return b_grid_desc_bk0_n_bk1;
        }
+#endif
    }

    template <typename ABlockDesc_AK0_M_AK1>
@@ -412,6 +448,13 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
            }
        }();

+        // Pad both M and N to be multiples of the block sizes
+        return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
+                                           make_tuple(make_right_pad_transform(M, MPad - M),
+                                                      make_right_pad_transform(N, NPad - N)),
+                                           make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                           make_tuple(Sequence<0>{}, Sequence<1>{}));
+#if 0
        using GemmSpecialization = tensor_operation::device::GemmSpecialization;

        if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
@@ -449,6 +492,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
            // not pad M or N
            return c_grid_desc_mraw_nraw;
        }
+#endif
    }

    struct Problem
@@ -953,7 +997,8 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
                       GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
-                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) &&
+                     !(is_same<tensor_layout::gemm::RowMajor, ALayout>::value))
        {
            if(!(karg.M % MPerBlock == 0))
            {
@@ -970,7 +1015,8 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
        if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding ||
                       GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
                       GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
-                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) &&
+                     (is_same<tensor_layout::gemm::RowMajor, BLayout>::value))
        {
            if(!(karg.N % NPerBlock == 0))
            {
@@ -1036,6 +1082,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                              << ABlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
                              << __LINE__ << ", in function: " << __func__ << std::endl;
                }
+
                return false;
            }
        }
@@ -1051,6 +1098,10 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                              << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
                              << __LINE__ << ", in function: " << __func__ << std::endl;
                }
+                std::cout << "Arg N (" << karg.N
+                          << ") value is not a multiple of BBlockTransferSrcScalarPerVector ("
+                          << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                          << __LINE__ << ", in function: " << __func__ << std::endl;
                return false;
            }
        }
@@ -1065,6 +1116,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                              << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
                              << __LINE__ << ", in function: " << __func__ << std::endl;
                }
+
                return false;
            }
        }
@@ -1082,6 +1134,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                              << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
                              << std::endl;
                }
+
                return false;
            }
        }
@@ -1098,17 +1151,8 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                              << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
                              << std::endl;
                }
-                return false;
-            }
-        }

-        if constexpr(is_same<remove_cvref_t<CDataType>, bhalf_t>::value)
-        {
-            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
-            {
-                std::cout << " Grid size: " << karg.Grid_size << " > 1 is not support yet"
-                          << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
-                          << std::endl;
+                return false;
            }
        }

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
@@ -674,11 +674,13 @@ struct GridwiseGemm_xdl_cshuffle_v3
    __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
    {
        // A matrix in LDS memory, dst of blockwise copy
-        if constexpr(ABlockLdsExtraM)
+        if constexpr(ABlockLdsExtraM || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
        {
+            // bank conflict when writting the data into LDS, but don't worry, we have whole entire
+            // loop to hide it in v4. it may give you some benefit from less valu in compute address
            return make_naive_tensor_descriptor(
                make_tuple(AK0Number, Number<MPerBlock>{}, AK1Number),
-                make_tuple(AK1Number, Number<KPerBlock + ABlockLdsExtraM>{}, I1));
+                make_tuple(Number<MPerBlock>{} * AK1Number, AK1Number, I1));
        }
        // xor tensor transformation request more unnecessary vgpr usage, would cause register spill
        // in some cases.
@@ -810,11 +812,13 @@ struct GridwiseGemm_xdl_cshuffle_v3
    __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
    {
        // B matrix in LDS memory, dst of blockwise copy
-        if constexpr(BBlockLdsExtraN)
+        if constexpr(BBlockLdsExtraN || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
        {
+            // bank conflict when writting the data into LDS, but don't worry, we have whole entire
+            // loop to hide it in v4. it may give you some benefit from less valu in compute address
            return make_naive_tensor_descriptor(
                make_tuple(BK0Number, Number<NPerBlock>{}, BK1Number),
-                make_tuple(BK1Number, Number<KPerBlock + BBlockLdsExtraN>{}, I1));
+                make_tuple(Number<NPerBlock + BBlockLdsExtraN>{} * BK1Number, BK1Number, I1));
        }
        else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
        {
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp
@@ -676,11 +676,13 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
    __device__ static constexpr auto GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1()
    {
        // A matrix in LDS memory, dst of blockwise copy
-        if constexpr(ABlockLdsExtraM)
+        if constexpr(ABlockLdsExtraM || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
        {
+            // bank conflict when writting the data into LDS, but don't worry, we have whole entire
+            // loop to hide it in v4. it may give you some benefit from less valu in compute address
            return make_naive_tensor_descriptor(
                make_tuple(AK0Number, Number<MPerBlock>{}, AK1Number),
-                make_tuple(AK1Number, Number<KPerBlock + ABlockLdsExtraM>{}, I1));
+                make_tuple(Number<MPerBlock>{} * AK1Number, AK1Number, I1));
        }
        // xor tensor transformation request more unnecessary vgpr usage, would cause register spill
        // in some cases.
@@ -813,11 +815,13 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
    __device__ static constexpr auto GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1()
    {
        // B matrix in LDS memory, dst of blockwise copy
-        if constexpr(BBlockLdsExtraN)
+        if constexpr(BBlockLdsExtraN || BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
        {
+            // bank conflict when writting the data into LDS, but don't worry, we have whole entire
+            // loop to hide it in v4. it may give you some benefit from less valu in compute address
            return make_naive_tensor_descriptor(
                make_tuple(BK0Number, Number<NPerBlock>{}, BK1Number),
-                make_tuple(BK1Number, Number<KPerBlock + BBlockLdsExtraN>{}, I1));
+                make_tuple(Number<NPerBlock + BBlockLdsExtraN>{} * BK1Number, BK1Number, I1));
        }
        else if constexpr(is_same<tensor_layout::gemm::ColumnMajor, BLayout>::value)
        {
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp
--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
@@ -352,6 +352,14 @@ struct ThreadwiseTensorSliceTransfer_v3r1
        }
    }

+    template <typename SeqIdx, index_t ThreadScratchId = 0>
+    __device__ constexpr auto
+    GetSrcThreadScratchIdx(Number<ThreadScratchId> thread_scratch_id = Number<ThreadScratchId>{})
+    {
+        using vector_t = typename vector_type_maker<SrcData, SrcScalarPerVector>::type::type;
+        return src_thread_scratch_tuple_(thread_scratch_id).template GetAsType<vector_t>(SeqIdx{});
+    }
+
    template <index_t ThreadScratchId>
    __device__ void
    TransferDataFromSrcThreadScratchToDstThreadScratch(Number<ThreadScratchId> thread_scratch_id)
--- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -780,7 +780,6 @@ struct mfma_type<MfmaInstr::mfma_f32_16x16x32bf8f8>
    }
 };

-// TODO: fix mfma...f8f6f4 instructions
 template <>
 struct mfma_type<MfmaInstr::mfma_f32_32x32x64f8f6f4>
 {
@@ -847,9 +846,14 @@ struct mfma_type<MfmaInstr::mfma_scale_f32_32x32x64f8f6f4>
    // clang-format on

    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
-    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    __device__ void run(const FloatA& a,
+                        const int32_t scale_a,
+                        const FloatB& b,
+                        const int32_t scale_b,
+                        FloatC& reg_c) const
    {
-        intrin_mfma_scale_f32_32x32x64f8f6f4<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+        intrin_mfma_scale_f32_32x32x64f8f6f4<MPerXdlops, NPerXdlops>::Run(
+            a, scale_a, b, scale_b, reg_c);
    }
 };

@@ -871,9 +875,14 @@ struct mfma_type<MfmaInstr::mfma_scale_f32_16x16x128f8f6f4>
    // clang-format on

    template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
-    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    __device__ void run(const FloatA& a,
+                        const int32_t scale_a,
+                        const FloatB& b,
+                        const int32_t scale_b,
+                        FloatC& reg_c) const
    {
-        intrin_mfma_scale_f32_16x16x128f8f6f4<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+        intrin_mfma_scale_f32_16x16x128f8f6f4<MPerXdlops, NPerXdlops>::Run(
+            a, scale_a, b, scale_b, reg_c);
    }
 };

--- a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm_v2.hpp
+++ b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm_v2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -34,6 +34,94 @@ struct TransformConvBwdWeightToGemmV2
    static constexpr auto I0 = Number<0>{};
    static constexpr auto I1 = Number<1>{};

+    template <index_t NDim, typename enable_if<NDim == 1, bool>::type = false>
+    constexpr static auto
+    make_out_grid_desc(const index_t N,
+                       const index_t Wo,
+                       const index_t K,
+                       const std::array<index_t, NDimSpatial + 3>& output_strides)
+    {
+        const index_t BatchStride = output_strides[0];
+        const index_t WoStride    = output_strides[3];
+        const auto KStride        = Number<1>{};
+        return make_naive_tensor_descriptor(make_tuple(N * Wo, NumGroupsToMerge, K),
+                                            make_tuple(WoStride, BatchStride, KStride));
+    }
+
+    template <index_t NDim, typename enable_if<NDim == 1, bool>::type = false>
+    constexpr static auto
+    make_in_grid_desc(const index_t N,
+                      const index_t Wi,
+                      const index_t C,
+                      const std::array<index_t, NDimSpatial + 3>& input_strides)
+    {
+        const index_t BatchStride = input_strides[0];
+        const index_t NStride     = input_strides[1];
+        const index_t WiStride    = input_strides[3];
+        const auto CStride        = input_strides[2];
+        if constexpr(ConvBackwardWeightSpecialization ==
+                     device::ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0)
+        {
+            return make_naive_tensor_descriptor(make_tuple(N * Wi, NumGroupsToMerge, C),
+                                                make_tuple(WiStride, BatchStride, CStride));
+        }
+        else
+        {
+            return make_naive_tensor_descriptor(
+                make_tuple(N, Wi, NumGroupsToMerge, C),
+                make_tuple(NStride, WiStride, BatchStride, CStride));
+        }
+    }
+
+    template <index_t NDim, typename enable_if<NDim == 1, bool>::type = false>
+    constexpr static auto
+    make_wei_grid_desc(const index_t K,
+                       const index_t X,
+                       const index_t C,
+                       const std::array<index_t, NDimSpatial + 3>& weights_strides)
+    {
+        const auto CStride     = Number<1>{};
+        const auto KStride     = weights_strides[1];
+        const auto XStride     = weights_strides[3];
+        const auto BatchStride = weights_strides[0];
+        // Add NumGroupsToMerge for Batch+M dimension and, 1 as a placehorder
+        // for Batch+N dimension
+        const auto desc = make_naive_tensor_descriptor(
+            make_tuple(NumGroupsToMerge, K, X, 1, C),
+            make_tuple(BatchStride, KStride, XStride, BatchStride, CStride));
+        // Padd 1 to NumGroupsToMerge
+        const auto padded_desc = transform_tensor_descriptor(
+            desc,
+            make_tuple(make_pass_through_transform(NumGroupsToMerge),
+                       make_pass_through_transform(K),
+                       make_pass_through_transform(X),
+                       make_pad_transform(1, 0, NumGroupsToMerge - 1),
+                       make_pass_through_transform(C)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
+        // We need only matrices from diagonal. Xor returns 0 for the same
+        // values. So if matrices is not on diagonal then it will be stored in padding.
+        // To avoid use of modulo after xor we assume that NumBatch to merge is power of 2.
+        static_assert(NumGroupsToMerge == 1 || NumGroupsToMerge == 2 || NumGroupsToMerge == 4 ||
+                      NumGroupsToMerge == 8 || NumGroupsToMerge == 16 || NumGroupsToMerge == 32 ||
+                      NumGroupsToMerge == 64);
+        const auto unmerged_padded_desc = transform_tensor_descriptor(
+            padded_desc,
+            make_tuple(make_xor_transform(make_tuple(NumGroupsToMerge, NumGroupsToMerge)),
+                       make_pass_through_transform(K),
+                       make_pass_through_transform(X),
+                       make_pass_through_transform(C)),
+            make_tuple(Sequence<0, 3>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}),
+            make_tuple(Sequence<0, 3>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}));
+        // Merge To M, N
+        return transform_tensor_descriptor(
+            unmerged_padded_desc,
+            make_tuple(make_merge_transform(make_tuple(NumGroupsToMerge, K)),
+                       make_merge_transform(make_tuple(X, NumGroupsToMerge, C))),
+            make_tuple(Sequence<0, 1>{}, Sequence<2, 3, 4>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+    }
+
    template <index_t NDim, typename enable_if<NDim == 2, bool>::type = false>
    constexpr static auto
    make_out_grid_desc(const index_t N,
@@ -221,6 +309,187 @@ struct TransformConvBwdWeightToGemmV2
            make_tuple(Sequence<0>{}, Sequence<1>{}));
    }

+    template <index_t NDim, typename enable_if<NDim == 1, bool>::type = false>
+    static auto MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(
+        const index_t N,
+        const index_t K,
+        const index_t C,
+        const std::array<index_t, NDimSpatial>& input_spatial_lengths,
+        const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
+        const std::array<index_t, NDimSpatial>& output_spatial_lengths,
+        const std::array<index_t, NDimSpatial + 3>& input_strides,
+        const std::array<index_t, NDimSpatial + 3>& weights_strides,
+        const std::array<index_t, NDimSpatial + 3>& output_strides,
+        const std::array<index_t, NDimSpatial>& conv_filter_strides,
+        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+        const std::array<index_t, NDimSpatial>& input_left_pads,
+        const std::array<index_t, NDimSpatial>& input_right_pads,
+        const index_t batch_k)
+    {
+        using namespace ck;
+
+        const index_t Wi = input_spatial_lengths[0];
+
+        const index_t Wo = output_spatial_lengths[0];
+
+        const index_t X = filter_spatial_lengths[0];
+
+        const index_t ConvStrideW = conv_filter_strides[0];
+
+        const index_t ConvDilationW = conv_filter_dilations[0];
+
+        const index_t InLeftPadW = input_left_pads[0];
+
+        const index_t InRightPadW = input_right_pads[0];
+
+        const index_t GemmKTotal = N * Wo;
+        const index_t GemmM      = K * NumGroupsToMerge;
+        const index_t GemmN      = C * X * NumGroupsToMerge;
+
+        const auto PadGemmM = MPerBlock - GemmM % MPerBlock;
+        const auto PadGemmN = NPerBlock - GemmN % NPerBlock;
+
+        const index_t GemmKBatch = batch_k;
+        const index_t GemmK0 =
+            math::integer_divide_ceil(GemmKTotal, GemmK1Number * K0PerBlock * GemmKBatch) *
+            K0PerBlock;
+        const index_t GemmKPad = GemmKBatch * GemmK0 * GemmK1Number;
+
+        const auto out_grid_desc = make_out_grid_desc<NDim>(N, Wo, K, output_strides);
+        const auto in_grid_desc  = make_in_grid_desc<NDim>(N, Wi, C, input_strides);
+        const auto wei_grid_desc = make_wei_grid_desc<NDim>(K, X, C, weights_strides);
+
+        if constexpr(ConvBackwardWeightSpecialization ==
+                     device::ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0)
+        {
+            // A: output tensor
+            const auto out_gemmkpad_gemmm_grid_desc = transform_tensor_descriptor(
+                out_grid_desc,
+                make_tuple(
+                    make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
+                    make_merge_transform(make_tuple(NumGroupsToMerge, GemmM / NumGroupsToMerge))),
+                make_tuple(Sequence<0>{}, Sequence<1, 2>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
+                out_gemmkpad_gemmm_grid_desc,
+                make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)),
+                           make_pass_through_transform(GemmM)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            // B: input tensor
+            const auto in_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor(
+                in_grid_desc,
+                make_tuple(
+                    make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
+                    make_merge_transform(make_tuple(NumGroupsToMerge, GemmN / NumGroupsToMerge))),
+                make_tuple(Sequence<0>{}, Sequence<1, 2>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
+                in_gemmkpad_gemmn_grid_desc,
+                make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)),
+                           make_pass_through_transform(GemmN)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
+                              in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
+                              wei_grid_desc);
+        }
+        else
+        {
+            // A: output tensor
+            const auto out_gemmkpad_gemmm_grid_desc = transform_tensor_descriptor(
+                out_grid_desc,
+                make_tuple(
+                    make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
+                    make_merge_transform(make_tuple(NumGroupsToMerge, GemmM / NumGroupsToMerge))),
+                make_tuple(Sequence<0>{}, Sequence<1, 2>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
+                out_gemmkpad_gemmm_grid_desc,
+                make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)),
+                           make_pass_through_transform(GemmM)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            // B: input tensor
+            const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
+                in_grid_desc,
+                make_tuple(make_pass_through_transform(N),
+                           make_pad_transform(Wi, InLeftPadW, InRightPadW),
+                           make_pass_through_transform(NumGroupsToMerge),
+                           make_pass_through_transform(C)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+
+            const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor(
+                in_n_hip_wip_c_grid_desc,
+                make_tuple(
+                    make_pass_through_transform(N),
+                    make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
+                    make_pass_through_transform(NumGroupsToMerge),
+                    make_pass_through_transform(C)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}, Sequence<4>{}));
+
+            const auto in_gemmktotal_gemmn_grid_desc = transform_tensor_descriptor(
+                in_n_y_ho_x_wo_c_grid_desc,
+                make_tuple(make_merge_transform(make_tuple(X, NumGroupsToMerge, C)),
+                           make_merge_transform(make_tuple(N, Wo))),
+                make_tuple(Sequence<1, 3, 4>{}, Sequence<0, 2>{}),
+                make_tuple(Sequence<1>{}, Sequence<0>{}));
+
+            const auto in_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor(
+                in_gemmktotal_gemmn_grid_desc,
+                make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
+                           make_pass_through_transform(GemmN)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
+                in_gemmkpad_gemmn_grid_desc,
+                make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)),
+                           make_pass_through_transform(GemmN)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            // Padd
+            const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc =
+                transform_tensor_descriptor(
+                    out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
+                    make_tuple(make_pass_through_transform(GemmKBatch * GemmK0),
+                               make_right_pad_transform(GemmM, PadGemmM),
+                               make_pass_through_transform(GemmK1Number)),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+
+            const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc =
+                transform_tensor_descriptor(
+                    in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
+                    make_tuple(make_pass_through_transform(GemmKBatch * GemmK0),
+                               make_right_pad_transform(GemmN, PadGemmN),
+                               make_pass_through_transform(GemmK1Number)),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+
+            const auto wei_gemmm_gemmn_pad_grid_desc =
+                transform_tensor_descriptor(wei_grid_desc,
+                                            make_tuple(make_right_pad_transform(GemmM, PadGemmM),
+                                                       make_right_pad_transform(GemmN, PadGemmN)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc,
+                              in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc,
+                              wei_gemmm_gemmn_pad_grid_desc);
+        }
+
+    } // function end
+
    template <index_t NDim, typename enable_if<NDim == 2, bool>::type = false>
    static auto MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(
        const index_t N,
--- a/include/ck/utility/amd_buffer_addressing.hpp
+++ b/include/ck/utility/amd_buffer_addressing.hpp
@@ -1008,6 +1008,7 @@ llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc,
                                index_t offset,
                                index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds");

+#ifndef __HIPCC_RTC__
 template <typename T, index_t NumElemsPerThread>
 __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
                                              const index_t global_offset,
@@ -1059,5 +1060,6 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
        src_resource, lds_ptr, sizeof(uint32_t), global_offset_bytes, 0, 0, 0);
 #endif
 }
+#endif

 } // namespace ck
--- a/include/ck/utility/amd_wave_read_first_lane.hpp
+++ b/include/ck/utility/amd_wave_read_first_lane.hpp
@@ -7,7 +7,7 @@
 #include "ck/utility/functional2.hpp"
 #include "ck/utility/math.hpp"

-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 #include <array>
 #include <cstddef>
 #include <cstdint>
--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -533,9 +533,9 @@ struct intrin_mfma_scale_f32_32x32x64f8f6f4<32, 32>
                reg_c.template AsType<float16_t>()[Number<0>{}],
                0, // cbsz
                0, // blgp
-                0, // { OPSEL_HI[0], OPSEL[0] }?
+                0, // OPSEL
                scale_a,
-                0, // { OPSEL_HI[1], OPSEL[1] }?
+                0, // OPSEL
                scale_b);
 #else
        ignore = reg_a;
@@ -569,9 +569,9 @@ struct intrin_mfma_scale_f32_16x16x128f8f6f4<16, 16>
                reg_c.template AsType<float4_t>()[Number<0>{}],
                0, // cbsz
                0, // blgp
-                0, // { OPSEL_HI[0], OPSEL[0] }?
+                0, // OPSEL
                scale_a,
-                0, // { OPSEL_HI[1], OPSEL[1] }?
+                0, // OPSEL
                scale_b);
 #else
        ignore = reg_a;
--- a/include/ck/utility/blkgemmpipe_scheduler.hpp
+++ b/include/ck/utility/blkgemmpipe_scheduler.hpp
@@ -8,6 +8,19 @@

 namespace ck {

+enum struct BlockGemmPipelineVersion
+{
+    // For GEMM
+    v1, // Naive
+    v2, // Mem
+    v3, // Comp
+    v4, // Comp, double lds buffer
+    v5, // Comp, double global prefetch register buffer
+
+    // For GEMM with preshuffled weight
+    // v1, single lds buffer
+    // v2, double lds buffer
+};
 enum struct BlockGemmPipelineScheduler
 {
    Intrawave,
--- a/include/ck/utility/data_type.hpp
+++ b/include/ck/utility/data_type.hpp
@@ -6,16 +6,20 @@
 #include "ck/utility/amd_ck_fp8.hpp"
 #include "ck/utility/e8m0.hpp"
 #include "ck/utility/statically_indexed_array.hpp"
-#ifdef CK_CODE_GEN_RTC
+
+/// Definitions from <cstdint>, <cmath> conflict with
+/// /opt/rocm/include/hip/amd_detail/amd_hip_vector_types.h.
+
+#if defined(__HIPCC_RTC__) || defined(CK_CODE_GEN_RTC)
 using int8_t   = signed char;
 using uint8_t  = unsigned char;
 using int16_t  = signed short;
 using uint16_t = unsigned short;
 using float_t  = float;
-#endif
-namespace ck {
+#endif // __HIPCC_RTC__

-#ifdef CK_CODE_GEN_RTC
+namespace ck {
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 using byte = unsigned char;
 #else
 using std::byte;
@@ -2612,7 +2616,7 @@ using pk_i4x2_t = typename vector_type<pk_i4_t, 2>::type;
 using pk_i4x4_t = typename vector_type<pk_i4_t, 4>::type;
 using pk_i4x8_t = typename vector_type<pk_i4_t, 8>::type;

-#ifdef CK_CODE_GEN_RTC
+#if defined(__HIPCC_RTC__) || defined(CK_CODE_GEN_RTC)
 template <typename T>
 struct NumericLimits;

@@ -2825,6 +2829,118 @@ struct NumericLimits<bf8_ocp_t>
        return bit_cast<bf8_ocp_t>(binary_qnan);
    }
 };
+
+template <>
+struct NumericLimits<f4_t>
+{
+    static constexpr uint8_t binary_min_normal    = 0x2; // 0b0010
+    static constexpr uint8_t binary_max_normal    = 0x7; // 0b0111
+    static constexpr uint8_t binary_lowest_normal = 0xF; // 0b1111
+    static constexpr uint8_t binary_min_subnorm   = 0x1; // 0b0001
+    static constexpr uint8_t binary_max_subnorm   = 0x1; // 0b0001
+
+    static constexpr float data_max_normal_number    = 6;
+    static constexpr float data_min_subnormal_number = 0.5;
+
+    __host__ __device__ static constexpr f4_t Min() { return f4_t(binary_min_normal); }
+    __host__ __device__ static constexpr f4_t Max() { return f4_t(binary_max_normal); }
+    __host__ __device__ static constexpr f4_t Lowest() { return f4_t(binary_lowest_normal); }
+    __host__ __device__ static constexpr f4_t MinSubnorm() { return f4_t(binary_min_subnorm); }
+    __host__ __device__ static constexpr f4_t MaxSubnorm() { return f4_t(binary_max_subnorm); }
+
+    __host__ __device__ static constexpr float DataMaxNorm() { return data_max_normal_number; }
+    __host__ __device__ static constexpr float DataMinSubnorm()
+    {
+        return data_min_subnormal_number;
+    }
+};
+
+template <>
+struct NumericLimits<f6_t>
+{
+    static constexpr uint8_t binary_min_normal    = 0x08; // 0b001000
+    static constexpr uint8_t binary_max_normal    = 0x1F; // 0b011111
+    static constexpr uint8_t binary_lowest_normal = 0x3F; // 0b111111
+    static constexpr uint8_t binary_min_subnorm   = 0x01; // 0b000001
+    static constexpr uint8_t binary_max_subnorm   = 0x07; // 0b000111
+
+    static constexpr float data_max_normal_number    = 7.5;
+    static constexpr float data_min_subnormal_number = 0.125;
+
+    __host__ __device__ static constexpr f6_t Min() { return f6_t(binary_min_normal & 0b111111); }
+    __host__ __device__ static constexpr f6_t Max() { return f6_t(binary_max_normal & 0b111111); }
+    __host__ __device__ static constexpr f6_t Lowest()
+    {
+        return f6_t(binary_lowest_normal & 0b111111);
+    }
+    __host__ __device__ static constexpr f6_t MinSubnorm()
+    {
+        return f6_t(binary_min_subnorm & 0b111111);
+    }
+    __host__ __device__ static constexpr f6_t MaxSubnorm()
+    {
+        return f6_t(binary_max_subnorm & 0b111111);
+    }
+
+    __host__ __device__ static constexpr float DataMaxNorm() { return data_max_normal_number; }
+    __host__ __device__ static constexpr float DataMinSubnorm()
+    {
+        return data_min_subnormal_number;
+    }
+};
+
+template <>
+struct NumericLimits<bf6_t>
+{
+    static constexpr uint8_t binary_min_normal    = 0x08; // 0b001000
+    static constexpr uint8_t binary_max_normal    = 0x1F; // 0b011111
+    static constexpr uint8_t binary_lowest_normal = 0x3F; // 0b111111
+    static constexpr uint8_t binary_min_subnorm   = 0x01; // 0b000001
+    static constexpr uint8_t binary_max_subnorm   = 0x03; // 0b000011
+
+    static constexpr float data_max_normal_number    = 28;
+    static constexpr float data_min_subnormal_number = 0.0625;
+
+    __host__ __device__ static constexpr bf6_t Min() { return bf6_t(binary_min_normal); }
+    __host__ __device__ static constexpr bf6_t Max() { return bf6_t(binary_max_normal); }
+    __host__ __device__ static constexpr bf6_t Lowest() { return bf6_t(binary_lowest_normal); }
+    __host__ __device__ static constexpr bf6_t MinSubnorm() { return bf6_t(binary_min_subnorm); }
+    __host__ __device__ static constexpr bf6_t MaxSubnorm() { return bf6_t(binary_max_subnorm); }
+
+    __host__ __device__ static constexpr float DataMaxNorm() { return data_max_normal_number; }
+    __host__ __device__ static constexpr float DataMinSubnorm()
+    {
+        return data_min_subnormal_number;
+    }
+};
+
+template <>
+struct NumericLimits<e8m0_bexp_t>
+{
+    static constexpr e8m0_bexp_t binary_min  = 0x00; // 0b00000000
+    static constexpr e8m0_bexp_t binary_max  = 0xFE; // 0b11111110
+    static constexpr e8m0_bexp_t binary_qnan = 0xFF; // 0b11111111
+    static constexpr e8m0_bexp_t binary_1    = 0x7F; // 0b01111111
+    static constexpr e8m0_bexp_t binary_2    = 0x80; // 0b10000000
+    static constexpr e8m0_bexp_t binary_3    = 0x82; // 0b10000010
+    static constexpr e8m0_bexp_t binary_135  = 0x87; // 0b10000111
+    static constexpr e8m0_bexp_t binary_142  = 0x8E; // 0b10001110
+
+    __host__ __device__ static constexpr e8m0_bexp_t Min() { return e8m0_bexp_t(binary_min); }
+    __host__ __device__ static constexpr e8m0_bexp_t Max() { return e8m0_bexp_t(binary_max); }
+    __host__ __device__ static constexpr e8m0_bexp_t QuietNaN() { return e8m0_bexp_t(binary_qnan); }
+    __host__ __device__ static constexpr e8m0_bexp_t Binary_1() { return e8m0_bexp_t(binary_1); }
+    __host__ __device__ static constexpr e8m0_bexp_t Binary_2() { return e8m0_bexp_t(binary_2); }
+    __host__ __device__ static constexpr e8m0_bexp_t Binary_3() { return e8m0_bexp_t(binary_3); }
+    __host__ __device__ static constexpr e8m0_bexp_t Binary_135()
+    {
+        return e8m0_bexp_t(binary_135);
+    }
+    __host__ __device__ static constexpr e8m0_bexp_t Binary_142()
+    {
+        return e8m0_bexp_t(binary_142);
+    }
+};
 #else
 template <typename T>
 struct NumericLimits
@@ -2959,7 +3075,6 @@ struct NumericLimits<bf8_ocp_t>
        return bit_cast<bf8_ocp_t>(binary_qnan);
    }
 };
-#endif

 template <>
 struct NumericLimits<f4_t>
@@ -3072,6 +3187,7 @@ struct NumericLimits<e8m0_bexp_t>
        return e8m0_bexp_t(binary_142);
    }
 };
+#endif

 template <typename T>
 struct NumericUtils
--- a/include/ck/utility/enable_if.hpp
+++ b/include/ck/utility/enable_if.hpp
@@ -4,15 +4,7 @@
 #pragma once

 namespace ck {
-
-#ifndef CK_CODE_GEN_RTC
-template <bool B, typename T = void>
-using enable_if = std::enable_if<B, T>;
-
-template <bool B, typename T = void>
-using enable_if_t = typename std::enable_if<B, T>::type;
-
-#else
+#if defined(__HIPCC_RTC__) || defined(CK_CODE_GEN_RTC)
 template <bool B, class T = void>
 struct enable_if
 {
@@ -26,6 +18,12 @@ struct enable_if<true, T>

 template <bool B, class T = void>
 using enable_if_t = typename enable_if<B, T>::type;
-#endif

+#else
+template <bool B, typename T = void>
+using enable_if = std::enable_if<B, T>;
+
+template <bool B, typename T = void>
+using enable_if_t = typename std::enable_if<B, T>::type;
+#endif
 } // namespace ck
--- a/include/ck/utility/loop_scheduler.hpp
+++ b/include/ck/utility/loop_scheduler.hpp
@@ -1,12 +1,12 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

-#ifndef CK_CODE_GEN_RTC
+#pragma once
+
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 #include <ostream>
 #endif

-#pragma once
-
 #include "ck/utility/common_header.hpp"

 namespace ck {
@@ -28,7 +28,7 @@ constexpr LoopScheduler make_default_loop_scheduler()

 } // namespace ck

-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 inline std::ostream& operator<<(std::ostream& os, const ck::LoopScheduler& s)
 {
    switch(s)
--- a/include/ck/utility/magic_division.hpp
+++ b/include/ck/utility/magic_division.hpp
@@ -4,6 +4,7 @@
 #pragma once

 #include "ck/ck.hpp"
+#include "data_type.hpp"
 #include "integral_constant.hpp"
 #include "number.hpp"
 #include "type.hpp"
@@ -34,7 +35,7 @@ struct MagicDivision
        // WARNING: magic division is only applicable for division inside this range.
        // You should use the return value of CalculateMagicNumbers, if division is not inside this
        // range. The "else" logic below is to quiet down run-time error.
-        if(divisor >= 1 && divisor <= INT32_MAX)
+        if(divisor >= 1 && divisor <= ck::NumericLimits<int32_t>::Max())
        {
            uint32_t shift = 0;
            for(shift = 0; shift < 32; ++shift)
--- a/include/ck/utility/math_v2.hpp
+++ b/include/ck/utility/math_v2.hpp
@@ -19,7 +19,7 @@ extern "C" __device__ float __ocml_native_recip_f32(float);
 #endif

 // math functions for the host,  some are implemented by calling C++ std functions
-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 static inline __host__ float abs(float x) { return std::abs(x); };

 static inline __host__ double abs(double x) { return std::abs(x); };
@@ -924,5 +924,23 @@ inline __device__ double expm1<double>(double x)
    return expm1(x);
 };

+template <typename T>
+inline __device__ T cos(T x)
+{
+    return ck::type_convert<T>(cosf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __device__ float cos<float>(float x)
+{
+    return cosf(x);
+};
+
+template <>
+inline __device__ double cos<double>(double x)
+{
+    return cos(x);
+};
+
 } // namespace math
 } // namespace ck
--- a/include/ck/utility/sequence.hpp
+++ b/include/ck/utility/sequence.hpp
@@ -3,7 +3,7 @@

 #pragma once

-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 #include <ostream>
 #endif

@@ -902,7 +902,7 @@ using uniform_sequence_gen_t = typename uniform_sequence_gen<NSize, I>::type;

 } // namespace ck

-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 template <ck::index_t... Is>
 std::ostream& operator<<(std::ostream& os, const ck::Sequence<Is...>)
 {
--- a/include/ck/utility/tuple_helper.hpp
+++ b/include/ck/utility/tuple_helper.hpp
@@ -159,7 +159,7 @@ __host__ __device__ constexpr auto TupleReduce(F&& f, const Tuple<Ts...>& tuple)
    }
 }

-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 template <typename T>
 using is_tuple = decltype(ck::declval<T&>().IsTuple());
 #endif
@@ -167,7 +167,7 @@ using is_tuple = decltype(ck::declval<T&>().IsTuple());
 template <typename... Ts>
 __host__ __device__ constexpr auto IsNestedTuple(const Tuple<Ts...>&)
 {
-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
    return (is_detected<is_tuple, Ts>::value || ...);
 #endif
 }
--- a/include/ck/utility/type.hpp
+++ b/include/ck/utility/type.hpp
@@ -1,316 +1,313 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck/ck.hpp"
-#include "ck/utility/enable_if.hpp"
-#include "ck/utility/integral_constant.hpp"
-
-namespace ck {
-#ifdef CK_CODE_GEN_RTC
-// NOLINTNEXTLINE
-#define CK_BUILTIN_TYPE_TRAIT1(name)         \
-    template <class T>                       \
-    struct name : bool_constant<__##name(T)> \
-    {                                        \
-    }
-
-// NOLINTNEXTLINE
-#define CK_BUILTIN_TYPE_TRAIT2(name)            \
-    template <class T, class U>                 \
-    struct name : bool_constant<__##name(T, U)> \
-    {                                           \
-    }
-
-// NOLINTNEXTLINE
-#define CK_BUILTIN_TYPE_TRAITN(name)             \
-    template <class... Ts>                       \
-    struct name : bool_constant<__##name(Ts...)> \
-    {                                            \
-    }
-
-CK_BUILTIN_TYPE_TRAIT1(is_class);
-CK_BUILTIN_TYPE_TRAIT1(is_pointer);
-CK_BUILTIN_TYPE_TRAIT1(is_reference);
-CK_BUILTIN_TYPE_TRAIT1(is_trivially_copyable);
-CK_BUILTIN_TYPE_TRAIT1(is_unsigned);
-CK_BUILTIN_TYPE_TRAIT2(is_base_of);
-
-template <class T>
-struct remove_cv
-{
-    using type = T;
-};
-
-template <class T>
-struct remove_cv<const T> : remove_cv<T>
-{
-};
-
-template <class T>
-struct remove_cv<volatile T> : remove_cv<T>
-{
-};
-
-template <class T>
-struct remove_reference
-{
-    typedef T type;
-};
-template <class T>
-struct remove_reference<T&>
-{
-    typedef T type;
-};
-template <class T>
-struct remove_reference<T&&>
-{
-    typedef T type;
-};
-template <class T>
-struct remove_pointer
-{
-    typedef T type;
-};
-template <class T>
-struct remove_pointer<T*>
-{
-    typedef T type;
-};
-template <class T>
-struct remove_pointer<T* const>
-{
-    typedef T type;
-};
-template <class T>
-struct remove_pointer<T* volatile>
-{
-    typedef T type;
-};
-template <class T>
-struct remove_pointer<T* const volatile>
-{
-    typedef T type;
-};
-
-template <typename T>
-constexpr T&& forward(typename remove_reference<T>::type& t_) noexcept
-{
-    return static_cast<T&&>(t_);
-}
-template <typename T>
-constexpr T&& forward(typename remove_reference<T>::type&& t_) noexcept
-{
-    return static_cast<T&&>(t_);
-}
-
-template <class T>
-struct is_const : public integral_constant<bool, false>
-{
-};
-template <class T>
-struct is_const<const T> : public integral_constant<bool, true>
-{
-};
-template <class T>
-inline constexpr bool is_const_v = is_const<T>::value;
-
-template <typename T>
-inline constexpr bool is_reference_v = is_reference<T>::value;
-
-template <class T>
-struct remove_const
-{
-    typedef T type;
-};
-template <class T>
-struct remove_const<const T>
-{
-    typedef T type;
-};
-template <class T>
-using remove_const_t = typename remove_const<T>::type;
-template <class T>
-inline constexpr bool is_class_v = is_class<T>::value;
-
-template <class T>
-inline constexpr bool is_trivially_copyable_v = is_trivially_copyable<T>::value;
-// template <typename T>
-// T&& declval() noexcept;
-
-template <class T, class U = T&&>
-U private_declval(int);
-
-template <class T>
-T private_declval(long);
-
-template <class T>
-auto declval() noexcept -> decltype(private_declval<T>(0));
-
-template <class...>
-using void_t = void;
-#else
-#include <utility>
-#include <type_traits>
-using std::declval;
-using std::forward;
-using std::is_base_of;
-using std::is_class;
-using std::is_class_v;
-using std::is_const_v;
-using std::is_pointer;
-using std::is_reference;
-using std::is_reference_v;
-using std::is_trivially_copyable;
-using std::is_trivially_copyable_v;
-using std::is_unsigned;
-using std::remove_const_t;
-using std::remove_cv;
-using std::remove_pointer;
-using std::remove_reference;
-using std::void_t;
-#endif
-
-template <typename X, typename Y>
-struct is_same : public integral_constant<bool, false>
-{
-};
-
-template <typename X>
-struct is_same<X, X> : public integral_constant<bool, true>
-{
-};
-
-template <typename X>
-struct is_floating_point : public integral_constant<bool, false>
-{
-};
-
-template <>
-struct is_floating_point<float> : public integral_constant<bool, true>
-{
-};
-
-template <>
-struct is_floating_point<double> : public integral_constant<bool, true>
-{
-};
-template <>
-struct is_floating_point<long double> : public integral_constant<bool, true>
-{
-};
-
-template <typename X>
-struct is_integral : public integral_constant<bool, false>
-{
-};
-
-template <>
-struct is_integral<int> : public integral_constant<bool, true>
-{
-};
-
-template <>
-struct is_integral<unsigned int> : public integral_constant<bool, true>
-{
-};
-
-template <>
-struct is_integral<long> : public integral_constant<bool, true>
-{
-};
-
-template <>
-struct is_integral<unsigned long> : public integral_constant<bool, true>
-{
-};
-
-template <>
-struct is_integral<short> : public integral_constant<bool, true>
-{
-};
-template <>
-struct is_integral<unsigned short> : public integral_constant<bool, true>
-{
-};
-
-template <>
-struct is_integral<long long> : public integral_constant<bool, true>
-{
-};
-
-template <>
-struct is_integral<unsigned long long> : public integral_constant<bool, true>
-{
-};
-
-template <>
-struct is_integral<char> : public integral_constant<bool, true>
-{
-};
-
-template <>
-struct is_integral<signed char> : public integral_constant<bool, true>
-{
-};
-
-template <>
-struct is_integral<unsigned char> : public integral_constant<bool, true>
-{
-};
-
-template <>
-struct is_integral<wchar_t> : public integral_constant<bool, true>
-{
-};
-template <>
-struct is_integral<char16_t> : public integral_constant<bool, true>
-{
-};
-
-template <>
-struct is_integral<char32_t> : public integral_constant<bool, true>
-{
-};
-
-template <>
-struct is_integral<bool> : public integral_constant<bool, true>
-{
-};
-
-template <typename X, typename Y>
-inline constexpr bool is_same_v = is_same<X, Y>::value;
-
-template <typename X, typename Y>
-inline constexpr bool is_base_of_v = is_base_of<X, Y>::value;
-
-template <typename T>
-inline constexpr bool is_unsigned_v = is_unsigned<T>::value;
-
-template <typename T>
-using remove_reference_t = typename remove_reference<T>::type;
-
-template <typename T>
-using remove_reference_t = typename remove_reference<T>::type;
-
-template <typename T>
-using remove_cv_t = typename remove_cv<T>::type;
-template <typename T>
-using remove_cvref_t = remove_cv_t<remove_reference_t<T>>;
-
-template <typename T>
-using remove_pointer_t = typename remove_pointer<T>::type;
-
-template <typename T>
-inline constexpr bool is_pointer_v = is_pointer<T>::value;
-
-template <typename Y, typename X, typename enable_if<sizeof(X) == sizeof(Y), bool>::type = false>
-__host__ __device__ constexpr Y bit_cast(const X& x)
-{
-    static_assert(__has_builtin(__builtin_bit_cast), "");
-    static_assert(sizeof(X) == sizeof(Y), "Do not support cast between different size of type");
-
-    return __builtin_bit_cast(Y, x);
-}
-} // namespace ck
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/ck.hpp"
+#include "ck/utility/enable_if.hpp"
+#include "ck/utility/integral_constant.hpp"
+
+namespace ck {
+#if defined(__HIPCC_RTC__) || defined(CK_CODE_GEN_RTC)
+// NOLINTNEXTLINE
+#define CK_BUILTIN_TYPE_TRAIT1(name)         \
+    template <class T>                       \
+    struct name : bool_constant<__##name(T)> \
+    {                                        \
+    }
+
+// NOLINTNEXTLINE
+#define CK_BUILTIN_TYPE_TRAIT2(name)            \
+    template <class T, class U>                 \
+    struct name : bool_constant<__##name(T, U)> \
+    {                                           \
+    }
+
+// NOLINTNEXTLINE
+#define CK_BUILTIN_TYPE_TRAITN(name)             \
+    template <class... Ts>                       \
+    struct name : bool_constant<__##name(Ts...)> \
+    {                                            \
+    }
+
+CK_BUILTIN_TYPE_TRAIT1(is_class);
+CK_BUILTIN_TYPE_TRAIT1(is_pointer);
+CK_BUILTIN_TYPE_TRAIT1(is_reference);
+CK_BUILTIN_TYPE_TRAIT1(is_trivially_copyable);
+CK_BUILTIN_TYPE_TRAIT1(is_unsigned);
+CK_BUILTIN_TYPE_TRAIT2(is_base_of);
+
+template <class T>
+struct remove_cv
+{
+    using type = T;
+};
+
+template <class T>
+struct remove_cv<const T> : remove_cv<T>
+{
+};
+
+template <class T>
+struct remove_cv<volatile T> : remove_cv<T>
+{
+};
+
+template <class T>
+struct remove_reference
+{
+    typedef T type;
+};
+template <class T>
+struct remove_reference<T&>
+{
+    typedef T type;
+};
+template <class T>
+struct remove_reference<T&&>
+{
+    typedef T type;
+};
+template <class T>
+struct remove_pointer
+{
+    typedef T type;
+};
+template <class T>
+struct remove_pointer<T*>
+{
+    typedef T type;
+};
+template <class T>
+struct remove_pointer<T* const>
+{
+    typedef T type;
+};
+template <class T>
+struct remove_pointer<T* volatile>
+{
+    typedef T type;
+};
+template <class T>
+struct remove_pointer<T* const volatile>
+{
+    typedef T type;
+};
+
+template <typename T>
+constexpr T&& forward(typename remove_reference<T>::type& t_) noexcept
+{
+    return static_cast<T&&>(t_);
+}
+template <typename T>
+constexpr T&& forward(typename remove_reference<T>::type&& t_) noexcept
+{
+    return static_cast<T&&>(t_);
+}
+
+template <class T>
+struct is_const : public integral_constant<bool, false>
+{
+};
+template <class T>
+struct is_const<const T> : public integral_constant<bool, true>
+{
+};
+template <class T>
+inline constexpr bool is_const_v = is_const<T>::value;
+
+template <typename T>
+inline constexpr bool is_reference_v = is_reference<T>::value;
+
+template <class T>
+struct remove_const
+{
+    typedef T type;
+};
+template <class T>
+struct remove_const<const T>
+{
+    typedef T type;
+};
+template <class T>
+using remove_const_t = typename remove_const<T>::type;
+template <class T>
+inline constexpr bool is_class_v = is_class<T>::value;
+
+template <class T>
+inline constexpr bool is_trivially_copyable_v = is_trivially_copyable<T>::value;
+// template <typename T>
+// T&& declval() noexcept;
+
+template <class T, class U = T&&>
+U private_declval(int);
+
+template <class T>
+T private_declval(long);
+
+template <class T>
+auto declval() noexcept -> decltype(private_declval<T>(0));
+
+template <class...>
+using void_t = void;
+#else
+#include <utility>
+#include <type_traits>
+using std::declval;
+using std::forward;
+using std::is_base_of;
+using std::is_class;
+using std::is_class_v;
+using std::is_const_v;
+using std::is_pointer;
+using std::is_reference;
+using std::is_reference_v;
+using std::is_trivially_copyable;
+using std::is_trivially_copyable_v;
+using std::is_unsigned;
+using std::remove_const_t;
+using std::remove_cv;
+using std::remove_pointer;
+using std::remove_reference;
+using std::void_t;
+#endif
+
+template <typename X, typename Y>
+struct is_same : public integral_constant<bool, false>
+{
+};
+
+template <typename X>
+struct is_same<X, X> : public integral_constant<bool, true>
+{
+};
+
+template <typename X>
+struct is_floating_point : public integral_constant<bool, false>
+{
+};
+
+template <>
+struct is_floating_point<float> : public integral_constant<bool, true>
+{
+};
+
+template <>
+struct is_floating_point<double> : public integral_constant<bool, true>
+{
+};
+template <>
+struct is_floating_point<long double> : public integral_constant<bool, true>
+{
+};
+
+template <typename X>
+struct is_integral : public integral_constant<bool, false>
+{
+};
+
+template <>
+struct is_integral<int> : public integral_constant<bool, true>
+{
+};
+
+template <>
+struct is_integral<unsigned int> : public integral_constant<bool, true>
+{
+};
+
+template <>
+struct is_integral<long> : public integral_constant<bool, true>
+{
+};
+
+template <>
+struct is_integral<unsigned long> : public integral_constant<bool, true>
+{
+};
+
+template <>
+struct is_integral<short> : public integral_constant<bool, true>
+{
+};
+template <>
+struct is_integral<unsigned short> : public integral_constant<bool, true>
+{
+};
+
+template <>
+struct is_integral<long long> : public integral_constant<bool, true>
+{
+};
+
+template <>
+struct is_integral<unsigned long long> : public integral_constant<bool, true>
+{
+};
+
+template <>
+struct is_integral<char> : public integral_constant<bool, true>
+{
+};
+
+template <>
+struct is_integral<signed char> : public integral_constant<bool, true>
+{
+};
+
+template <>
+struct is_integral<unsigned char> : public integral_constant<bool, true>
+{
+};
+
+template <>
+struct is_integral<wchar_t> : public integral_constant<bool, true>
+{
+};
+template <>
+struct is_integral<char16_t> : public integral_constant<bool, true>
+{
+};
+
+template <>
+struct is_integral<char32_t> : public integral_constant<bool, true>
+{
+};
+
+template <>
+struct is_integral<bool> : public integral_constant<bool, true>
+{
+};
+
+template <typename X, typename Y>
+inline constexpr bool is_same_v = is_same<X, Y>::value;
+
+template <typename X, typename Y>
+inline constexpr bool is_base_of_v = is_base_of<X, Y>::value;
+
+template <typename T>
+inline constexpr bool is_unsigned_v = is_unsigned<T>::value;
+
+template <typename T>
+using remove_reference_t = typename remove_reference<T>::type;
+
+template <typename T>
+using remove_cv_t = typename remove_cv<T>::type;
+template <typename T>
+using remove_cvref_t = remove_cv_t<remove_reference_t<T>>;
+
+template <typename T>
+using remove_pointer_t = typename remove_pointer<T>::type;
+
+template <typename T>
+inline constexpr bool is_pointer_v = is_pointer<T>::value;
+
+template <typename Y, typename X, typename enable_if<sizeof(X) == sizeof(Y), bool>::type = false>
+__host__ __device__ constexpr Y bit_cast(const X& x)
+{
+    static_assert(__has_builtin(__builtin_bit_cast), "");
+    static_assert(sizeof(X) == sizeof(Y), "Do not support cast between different size of type");
+
+    return __builtin_bit_cast(Y, x);
+}
+} // namespace ck
--- a/include/ck/utility/type_convert.hpp
+++ b/include/ck/utility/type_convert.hpp
@@ -279,7 +279,6 @@ inline __host__ __device__ f8_fnuz_t f8_convert_sr<f8_fnuz_t, half_t>(half_t x)
    constexpr bool clip              = true;
    constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
    constexpr int seed               = 1254739;
-
 #ifndef CK_CODE_GEN_RTC
    uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<uintptr_t>(&x), x);
 #else
@@ -344,7 +343,6 @@ inline __host__ __device__ bf8_fnuz_t f8_convert_sr<bf8_fnuz_t, half_t>(half_t x
    constexpr bool clip              = true;
    constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
    constexpr int seed               = 1254739;
-
 #ifndef CK_CODE_GEN_RTC
    uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<uintptr_t>(&x), x);
 #else
@@ -1534,7 +1532,7 @@ inline __host__ __device__ float32_t type_convert<float32_t, bf6x32_t>(bf6x32_t
 #endif
 }

-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 template <typename Y, typename X, size_t NumElems>
 inline __host__ __device__ void array_convert(std::array<Y, NumElems>& y,
                                              const std::array<X, NumElems>& x)
--- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp
+++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -1309,7 +1309,9 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
             (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (std::is_same<T, fp8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
            (std::is_same<T, bf8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-            (std::is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
+            (std::is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (std::is_same<T, pk_int4_t>::value &&
+             (N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32)),
        "wrong! not implemented");

    using rtn_type = thread_buffer<T, N>;
--- a/include/ck_tile/core/container/array.hpp
+++ b/include/ck_tile/core/container/array.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -153,12 +153,12 @@ struct array<T, 0>
    CK_TILE_HOST_DEVICE void print() const { printf("array{size: 0, data: []}"); }
 };

-template <typename>
+template <typename, typename>
 struct vector_traits;

 // specialization for array
 template <typename T, index_t N>
-struct vector_traits<array<T, N>>
+struct vector_traits<array<T, N>, void>
 {
    using scalar_type                    = T;
    static constexpr index_t vector_size = N;
--- a/include/ck_tile/core/container/thread_buffer.hpp
+++ b/include/ck_tile/core/container/thread_buffer.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -149,17 +149,24 @@ struct thread_buffer {
 };
 // clang-format on

-template <typename>
+template <typename, typename>
 struct vector_traits;

 // specialization for array
 template <typename T, index_t N>
-struct vector_traits<thread_buffer<T, N>>
+struct vector_traits<thread_buffer<T, N>, std::enable_if_t<!std::is_class_v<T>>>
 {
    using scalar_type                    = T;
    static constexpr index_t vector_size = N;
 };

+template <typename T, index_t N>
+struct vector_traits<thread_buffer<T, N>, std::enable_if_t<std::is_class_v<T>>>
+{
+    using scalar_type                    = typename T::type;
+    static constexpr index_t vector_size = N;
+};
+
 #endif

 } // namespace ck_tile
--- a/include/ck_tile/core/container/tuple.hpp
+++ b/include/ck_tile/core/container/tuple.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -294,7 +294,7 @@ struct tuple : impl::tuple_base<make_index_sequence<sizeof...(T)>, T...>
 #undef TP_COM_
 };

-template <typename>
+template <typename, typename = void>
 struct vector_traits;

 // specialization for array
--- a/include/ck_tile/core/numeric/bfloat16.hpp
+++ b/include/ck_tile/core/numeric/bfloat16.hpp
@@ -376,14 +376,12 @@ struct numeric<bfloat16_t>
    }
 };

-template <typename T>
-struct numeric_traits;
-
 template <>
 struct numeric_traits<bfloat16_t>
 {
-    static constexpr int exp  = 8;
-    static constexpr int mant = 7;
+    static constexpr int exp        = 8;
+    static constexpr int mant       = 7;
+    static constexpr int PackedSize = 1;
 };

 #if CK_TILE_USE_CUSTOM_DATA_TYPE
--- a/include/ck_tile/core/numeric/float8.hpp
+++ b/include/ck_tile/core/numeric/float8.hpp
@@ -207,9 +207,6 @@ using bf8_t     = unsigned _BitInt(8);
 using bf8_raw_t = uint8_t;
 #endif

-template <typename T>
-struct numeric_traits;
-
 template <>
 struct numeric_traits<fp8_t>
 {
@@ -225,6 +222,7 @@ struct numeric_traits<fp8_t>
    static constexpr fp8_interpretation f8_interpret = fp8_interpretation::E4M3_FNUZ;
 #endif
    static constexpr uint8_t abs_mask = 0x7F;
+    static constexpr int PackedSize   = 1;
 };

 template <>
@@ -242,6 +240,7 @@ struct numeric_traits<bf8_t>
    static constexpr fp8_interpretation f8_interpret = fp8_interpretation::E5M2_FNUZ;
 #endif
    static constexpr uint8_t abs_mask = 0x7F;
+    static constexpr int PackedSize   = 1;
 };

 // below is sw fp8 conversion, not utilizing hw instruction
--- a/include/ck_tile/core/numeric/half.hpp
+++ b/include/ck_tile/core/numeric/half.hpp
@@ -223,9 +223,6 @@ struct numeric<half_t>
    }
 };

-template <typename T>
-struct numeric_traits;
-
 template <>
 struct numeric_traits<half_t>
 {
@@ -241,6 +238,7 @@ struct numeric_traits<half_t>
    static constexpr uint16_t NegInf    = 0xFC00;
    static constexpr uint16_t NaN       = 0x7C01;
    static constexpr uint16_t Neg0      = 0x8000;
+    static constexpr int PackedSize     = 1;
    using bitwise_type                  = uint16_t;
 };

@@ -383,4 +381,24 @@ half_t exp2(half_t x) { return static_cast<half_t>(exp2f(static_cast<float>(x)))
 CK_TILE_DEVICE
 half_t log(half_t x) { return static_cast<half_t>(__logf(static_cast<float>(x))); };
 #endif
+
+using fp16x2_t = _Float16 __attribute__((ext_vector_type(2)));
+
+CK_TILE_HOST fp16x2_t pk_add_f16(const fp16x2_t& x, const fp16x2_t& y)
+{
+    fp16x2_t vector_res;
+
+    vector_res.x = x.x + y.x;
+    vector_res.y = x.y + y.y;
+
+    return vector_res;
+}
+
+CK_TILE_DEVICE fp16x2_t pk_add_f16(const fp16x2_t& x, const fp16x2_t& y)
+{
+    fp16x2_t c;
+    asm volatile("v_pk_add_f16 %0, %1, %2" : "=v"(c) : "v"(x), "v"(y));
+    return c;
+}
+
 } // namespace ck_tile
--- a/include/ck_tile/core/numeric/int8.hpp
+++ b/include/ck_tile/core/numeric/int8.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #include "ck_tile/core/config.hpp"
 #include "ck_tile/core/numeric/half.hpp"
@@ -74,8 +74,6 @@ struct numeric<int8_t>
 };

 #if 0
-template <typename T>
-struct numeric_traits;

 template <>
 struct numeric_traits<int8_t>
@@ -91,6 +89,7 @@ struct numeric_traits<int8_t>
    static constexpr uint32_t NegInf    = 0xFC00;
    static constexpr uint32_t NaN       = 0x7C01;
    static constexpr uint32_t Neg0      = 0x8000;
+    static constexpr int PackedSize           = 1;
    using bitwise_type                  = uint16_t;
 };
 #endif
--- a/include/ck_tile/core/numeric/numeric.hpp
+++ b/include/ck_tile/core/numeric/numeric.hpp
@@ -77,7 +77,10 @@ struct numeric
 };

 template <typename T>
-struct numeric_traits;
+struct numeric_traits
+{
+    static constexpr int PackedSize = 1;
+};

 template <>
 struct numeric_traits<float>
@@ -94,6 +97,7 @@ struct numeric_traits<float>
    static constexpr uint32_t NegInf    = 0xFF800000;
    static constexpr uint32_t NaN       = 0x7F800001;
    static constexpr uint32_t Neg0      = 0x80000000;
+    static constexpr int PackedSize     = 1;
    using bitwise_type                  = uint32_t;
 };

--- a/include/ck_tile/core/numeric/pk_int4.hpp
+++ b/include/ck_tile/core/numeric/pk_int4.hpp
@@ -21,8 +21,8 @@ struct pk_int4_t
 {
    using type = int8_t;
    type data;
-    __host__ __device__ constexpr pk_int4_t() : data{type{}} {}
-    __host__ __device__ constexpr pk_int4_t(type init) : data{init} {}
+    CK_TILE_HOST_DEVICE constexpr pk_int4_t() : data{type{}} {}
+    CK_TILE_HOST_DEVICE constexpr pk_int4_t(type init) : data{init} {}
 };

 // limits
@@ -91,6 +91,16 @@ struct numeric<pk_int4_t>
    CK_TILE_HOST_DEVICE static constexpr pk_int4_t zero() { return 0; }
 };

+template <>
+struct numeric_traits<pk_int4_t>
+{
+    static constexpr int PackedSize = 2;
+};
+
+using fp32x2_t = float __attribute__((ext_vector_type(2)));
+using fp16x2_t = _Float16 __attribute__((ext_vector_type(2)));
+using bf16x2_t = bf16_raw_t __attribute__((ext_vector_type(2)));
+
 CK_TILE_HOST_DEVICE fp32x2_t pk_int4_t_to_fp32x2_t(const pk_int4_t& x)
 {
    uint8_t x_u8 = ck_tile::bit_cast<uint8_t>(x);
--- a/include/ck_tile/core/numeric/vector_type.hpp
+++ b/include/ck_tile/core/numeric/vector_type.hpp
@@ -10,6 +10,7 @@
 #include "ck_tile/core/numeric/float8.hpp"
 #include "ck_tile/core/numeric/half.hpp"
 #include "ck_tile/core/numeric/bfloat16.hpp"
+#include "ck_tile/core/numeric/pk_int4.hpp"
 #include "ck_tile/core/utility/type_traits.hpp"

 namespace ck_tile {
@@ -30,17 +31,34 @@ struct native_t
 // of compiler errors e.g. struct A; using Ax2_t = A __attribute__((ext_vector_type(2)));  -> will
 // have compiler error
 namespace impl {
+
+template <typename T_, index_t N_, typename = void>
+struct ext_vector;
+
 template <typename T_, index_t N_>
-struct ext_vector
+struct ext_vector<T_, N_, std::enable_if_t<!std::is_class_v<typename native_t<T_>::type>>>
 {
    static constexpr index_t N = N_;
-    using value_type           = typename native_t<remove_cvref_t<T_>>::type;
+    // struct type is not supported for ext_vector
+    using value_type = typename native_t<T_>::type;
+    static_assert(!std::is_class_v<value_type>);
+    using type = value_type __attribute__((ext_vector_type(N))); // this is danguous
+};
+
+template <typename T_, index_t N_>
+struct ext_vector<T_, N_, std::enable_if_t<std::is_class_v<typename native_t<T_>::type>>>
+{
+    static constexpr index_t N = N_;
+    // struct type is not supported for ext_vector
+    using value_type = typename native_t<T_>::type::type;
    static_assert(!std::is_class_v<value_type>);
    using type = value_type __attribute__((ext_vector_type(N))); // this is danguous
 };

 template <typename V_, index_t Vs_, index_t N_>
-struct ext_vector<V_ __attribute__((ext_vector_type(Vs_))), N_>
+struct ext_vector<V_ __attribute__((ext_vector_type(Vs_))),
+                  N_,
+                  std::enable_if_t<!std::is_class_v<typename native_t<V_>::type>>>
 {
    static constexpr index_t N = Vs_ * N_;
    using value_type           = typename native_t<remove_cvref_t<V_>>::type;
@@ -48,6 +66,17 @@ struct ext_vector<V_ __attribute__((ext_vector_type(Vs_))), N_>
    using type = value_type __attribute__((ext_vector_type(N))); // this is danguous
 };

+template <typename V_, index_t Vs_, index_t N_>
+struct ext_vector<V_ __attribute__((ext_vector_type(Vs_))),
+                  N_,
+                  std::enable_if_t<std::is_class_v<typename native_t<V_>::type>>>
+{
+    static constexpr index_t N = Vs_ * N_;
+    using value_type           = typename native_t<remove_cvref_t<V_>>::type::type;
+    static_assert(!std::is_class_v<value_type>);
+    using type = value_type __attribute__((ext_vector_type(N))); // this is danguous
+};
+
 } // namespace impl

 template <typename T, index_t N>
@@ -55,10 +84,11 @@ using ext_vector_t = typename impl::ext_vector<T, N>::type;

 // by default, any type will result in a vector_size=1 with scalar_type=T traits.
 // ... unless we have other vector_traits specialization
-template <typename T>
+template <typename T, typename>
 struct vector_traits
 {
-    using scalar_type                    = remove_cvref_t<T>;
+    using scalar_type =
+        std::conditional_t<std::is_same_v<remove_cvref_t<T>, pk_int4_t>, int8_t, remove_cvref_t<T>>;
    static constexpr index_t vector_size = 1;
 };

@@ -66,7 +96,7 @@ struct vector_traits
 template <typename T, index_t N>
 struct vector_traits<T __attribute__((ext_vector_type(N)))>
 {
-    using scalar_type                    = T;
+    using scalar_type = std::conditional_t<std::is_same_v<T, pk_int4_t>, int8_t, T>;
    static constexpr index_t vector_size = N;
 };

@@ -200,21 +230,11 @@ using bf8x32_t = bf8_t __attribute((ext_vector_type(32)));
 using bf8x64_t = bf8_t __attribute((ext_vector_type(64)));
 #endif

-CK_TILE_HOST fp16x2_t pk_add_f16(const fp16x2_t& x, const fp16x2_t& y)
-{
-    fp16x2_t vector_res;
-
-    vector_res.x = x.x + y.x;
-    vector_res.y = x.y + y.y;
-
-    return vector_res;
-}
-
-CK_TILE_DEVICE fp16x2_t pk_add_f16(const fp16x2_t& x, const fp16x2_t& y)
-{
-    fp16x2_t c;
-    asm volatile("v_pk_add_f16 %0, %1, %2" : "=v"(c) : "v"(x), "v"(y));
-    return c;
-}
-
+// pk_int4_t
+// using pk_int4_t
+using pk_int4x2_t  = int8_t __attribute((ext_vector_type(2)));
+using pk_int4x4_t  = int8_t __attribute((ext_vector_type(4)));
+using pk_int4x8_t  = int8_t __attribute((ext_vector_type(8)));
+using pk_int4x16_t = int8_t __attribute((ext_vector_type(16)));
+using pk_int4x32_t = int8_t __attribute((ext_vector_type(32)));
 } // namespace ck_tile
--- a/include/ck_tile/core/tensor/buffer_view.hpp
+++ b/include/ck_tile/core/tensor/buffer_view.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -231,13 +231,18 @@ struct buffer_view<address_space_enum::global,
    int32x4_t cached_buf_res_;
    remove_cvref_t<T> invalid_element_value_ = T{0};

+    static constexpr index_t PackedSize = ck_tile::numeric_traits<remove_cvref_t<T>>::PackedSize;
+
    CK_TILE_HOST_DEVICE constexpr buffer_view()
        : p_data_{}, buffer_size_{}, cached_buf_res_{0}, invalid_element_value_{}
    {
    }

    CK_TILE_HOST_DEVICE constexpr buffer_view(T* p_data, BufferSizeType buffer_size)
-        : p_data_{p_data}, buffer_size_{buffer_size}, cached_buf_res_{0}, invalid_element_value_{0}
+        : p_data_{p_data},
+          buffer_size_{buffer_size / PackedSize},
+          cached_buf_res_{0},
+          invalid_element_value_{0}
    {
    }

@@ -245,7 +250,7 @@ struct buffer_view<address_space_enum::global,
                                              BufferSizeType buffer_size,
                                              T invalid_element_value)
        : p_data_{p_data},
-          buffer_size_{buffer_size},
+          buffer_size_{buffer_size / PackedSize},
          cached_buf_res_{0},
          invalid_element_value_{invalid_element_value}
    {
@@ -255,7 +260,7 @@ struct buffer_view<address_space_enum::global,
    // Must call for buffers that need *_raw load/store
    CK_TILE_HOST_DEVICE void init_raw()
    {
-        cached_buf_res_ = make_wave_buffer_resource(p_data_, buffer_size_ * sizeof(type));
+        cached_buf_res_ = make_wave_buffer_resource(p_data_, (buffer_size_) * sizeof(type));
    }

    CK_TILE_DEVICE static constexpr address_space_enum get_address_space()
@@ -887,8 +892,8 @@ struct buffer_view<address_space_enum::lds,
 #endif

        i += linear_offset; // simplicity
-        if constexpr(std::is_same<typename vector_traits<remove_cvref_t<T>>::scalar_type,
-                                  int8_t>::value &&
+        if constexpr(std::is_same_v<typename vector_traits<remove_cvref_t<T>>::scalar_type,
+                                    int8_t> &&
                     workaround_int8_ds_write_issue)
        {
            if(is_valid_element)
@@ -897,83 +902,117 @@ struct buffer_view<address_space_enum::lds,
                // ISA, so I try to let compiler emit IR "store<i32, 4>" which would be lower to
                // ds_write_b128
                // TODO: remove this after compiler fix
-                static_assert((std::is_same<remove_cvref_t<T>, int8_t>::value &&
-                               std::is_same<remove_cvref_t<X>, int8_t>::value) ||
-                                  (std::is_same<remove_cvref_t<T>, int8_t>::value &&
-                                   std::is_same<remove_cvref_t<X>, int8x2_t>::value) ||
-                                  (std::is_same<remove_cvref_t<T>, int8_t>::value &&
-                                   std::is_same<remove_cvref_t<X>, int8x4_t>::value) ||
-                                  (std::is_same<remove_cvref_t<T>, int8_t>::value &&
-                                   std::is_same<remove_cvref_t<X>, int8x8_t>::value) ||
-                                  (std::is_same<remove_cvref_t<T>, int8_t>::value &&
-                                   std::is_same<remove_cvref_t<X>, int8x16_t>::value) ||
-                                  (std::is_same<remove_cvref_t<T>, int8x4_t>::value &&
-                                   std::is_same<remove_cvref_t<X>, int8x4_t>::value) ||
-                                  (std::is_same<remove_cvref_t<T>, int8x8_t>::value &&
-                                   std::is_same<remove_cvref_t<X>, int8x8_t>::value) ||
-                                  (std::is_same<remove_cvref_t<T>, int8x16_t>::value &&
-                                   std::is_same<remove_cvref_t<X>, int8x16_t>::value),
-                              "wrong! not implemented for this combination, please add "
-                              "implementation");
+                static_assert(
+                    (std::is_same_v<remove_cvref_t<T>, int8_t> &&
+                     std::is_same_v<remove_cvref_t<X>, int8_t>) ||
+                        (std::is_same_v<remove_cvref_t<T>, int8_t> &&
+                         std::is_same_v<remove_cvref_t<X>, int8x2_t>) ||
+                        (std::is_same_v<remove_cvref_t<T>, int8_t> &&
+                         std::is_same_v<remove_cvref_t<X>, int8x4_t>) ||
+                        (std::is_same_v<remove_cvref_t<T>, int8_t> &&
+                         std::is_same_v<remove_cvref_t<X>, int8x8_t>) ||
+                        (std::is_same_v<remove_cvref_t<T>, int8_t> &&
+                         std::is_same_v<remove_cvref_t<X>, int8x16_t>) ||
+                        (std::is_same_v<remove_cvref_t<T>, int8x4_t> &&
+                         std::is_same_v<remove_cvref_t<X>, int8x4_t>) ||
+                        (std::is_same_v<remove_cvref_t<T>, int8x8_t> &&
+                         std::is_same_v<remove_cvref_t<X>, int8x8_t>) ||
+                        (std::is_same_v<remove_cvref_t<T>, int8x16_t> &&
+                         std::is_same_v<remove_cvref_t<X>, int8x16_t>) ||
+                        // ext_vector_type for pk_int4 must use int8_t as type
+                        (std::is_same_v<remove_cvref_t<T>, pk_int4_t> &&
+                         std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 1>>) ||
+                        (std::is_same_v<remove_cvref_t<T>, pk_int4_t> &&
+                         std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 2>>) ||
+                        (std::is_same_v<remove_cvref_t<T>, pk_int4_t> &&
+                         std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 4>>) ||
+                        (std::is_same_v<remove_cvref_t<T>, pk_int4_t> &&
+                         std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 8>>) ||
+                        (std::is_same_v<remove_cvref_t<T>, pk_int4_t> &&
+                         std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 16>>) ||
+                        (std::is_same_v<remove_cvref_t<T>, pk_int4x4_t> &&
+                         std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 4>>) ||
+                        (std::is_same_v<remove_cvref_t<T>, pk_int4x8_t> &&
+                         std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 8>>) ||
+                        (std::is_same_v<remove_cvref_t<T>, pk_int4x16_t> &&
+                         std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 16>>),
+                    "wrong! not implemented for this combination, please add "
+                    "implementation");

-                if constexpr(std::is_same<remove_cvref_t<T>, int8_t>::value &&
-                             std::is_same<remove_cvref_t<X>, int8_t>::value)
+                if constexpr((std::is_same_v<remove_cvref_t<T>, int8_t> &&
+                              std::is_same_v<remove_cvref_t<X>, int8_t>) ||
+                             (std::is_same_v<remove_cvref_t<T>, pk_int4_t> &&
+                              std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 1>>))
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int8_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int8_t*>(&x);
                }
-                else if constexpr(std::is_same<remove_cvref_t<T>, int8_t>::value &&
-                                  std::is_same<remove_cvref_t<X>, int8x2_t>::value)
+                else if constexpr((std::is_same_v<remove_cvref_t<T>, int8_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, int8x2_t>) ||
+                                  (std::is_same_v<remove_cvref_t<T>, pk_int4_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 2>>))
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int16_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int16_t*>(&x);
                }
-                else if constexpr(std::is_same<remove_cvref_t<T>, int8_t>::value &&
-                                  std::is_same<remove_cvref_t<X>, int8x4_t>::value)
+                else if constexpr((std::is_same_v<remove_cvref_t<T>, int8_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, int8x4_t>) ||
+                                  (std::is_same_v<remove_cvref_t<T>, pk_int4_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 4>>))
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int32_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int32_t*>(&x);
                }
-                else if constexpr(std::is_same<remove_cvref_t<T>, int8_t>::value &&
-                                  std::is_same<remove_cvref_t<X>, int8x8_t>::value)
+                else if constexpr((std::is_same_v<remove_cvref_t<T>, int8_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, int8x8_t>) ||
+                                  (std::is_same_v<remove_cvref_t<T>, pk_int4_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 8>>))
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int32x2_t*>(&x);
                }
-                else if constexpr(std::is_same<remove_cvref_t<T>, int8_t>::value &&
-                                  std::is_same<remove_cvref_t<X>, int8x16_t>::value)
+                else if constexpr((std::is_same_v<remove_cvref_t<T>, int8_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, int8x16_t>) ||
+                                  (std::is_same_v<remove_cvref_t<T>, pk_int4_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 16>>))
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int32x4_t*>(&x);
                }
-                else if constexpr(std::is_same<remove_cvref_t<T>, int8x4_t>::value &&
-                                  std::is_same<remove_cvref_t<X>, int8x4_t>::value)
+                else if constexpr((std::is_same_v<remove_cvref_t<T>, int8x4_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, int8x4_t>) ||
+                                  (std::is_same_v<remove_cvref_t<T>, pk_int4x4_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 4>>))
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int32_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int32_t*>(&x);
                }
-                else if constexpr(std::is_same<remove_cvref_t<T>, int8x8_t>::value &&
-                                  std::is_same<remove_cvref_t<X>, int8x8_t>::value)
+                else if constexpr((std::is_same_v<remove_cvref_t<T>, int8x8_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, int8x8_t>) ||
+                                  (std::is_same_v<remove_cvref_t<T>, pk_int4x8_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 8>>))
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
                    *c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
                        *c_style_pointer_cast<const int32x2_t*>(&x);
                }
-                else if constexpr(std::is_same<remove_cvref_t<T>, int8x16_t>::value &&
-                                  std::is_same<remove_cvref_t<X>, int8x16_t>::value)
+                else if constexpr((std::is_same_v<remove_cvref_t<T>, int8x16_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, int8x16_t>) ||
+                                  (std::is_same_v<remove_cvref_t<T>, pk_int4x16_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 16>>))
                {
                    // HACK: cast pointer of x is bad
                    // TODO: remove this after compiler fix
--- a/include/ck_tile/core/tensor/static_distributed_tensor.hpp
+++ b/include/ck_tile/core/tensor/static_distributed_tensor.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -27,6 +27,8 @@ struct static_distributed_tensor

    using ThreadTensorDesc =
        remove_cvref_t<decltype(StaticTileDistribution{}.get_ys_to_d_descriptor())>;
+    static constexpr index_t PackedSize =
+        ck_tile::numeric_traits<remove_cvref_t<DataType>>::PackedSize;

    static constexpr index_t kThreadElementSpaceSize = ThreadTensorDesc{}.get_element_space_size();
    static_assert(0 < kThreadElementSpaceSize, "Make sure tile distribution is valid");
@@ -59,7 +61,7 @@ struct static_distributed_tensor

    CK_TILE_HOST_DEVICE static constexpr index_t get_thread_buffer_size()
    {
-        return kThreadElementSpaceSize;
+        return kThreadElementSpaceSize / PackedSize;
    }

    template <index_t... YSliceOrigins, index_t... YSliceLengths>
@@ -79,8 +81,9 @@ struct static_distributed_tensor
        static_ford<sequence<YSliceLengths...>>{}([&](auto idx) {
            constexpr auto idx_ys = idx + sequence<YSliceOrigins...>{};

-            sliced_thread_data(number<sliced_thread_tensor_desc.calculate_offset(idx)>{}) =
-                thread_buf_[number<ThreadTensorDesc{}.calculate_offset(idx_ys)>{}];
+            sliced_thread_data(
+                number<sliced_thread_tensor_desc.calculate_offset(idx) / PackedSize>{}) =
+                thread_buf_[number<ThreadTensorDesc{}.calculate_offset(idx_ys) / PackedSize>{}];
        });

        return sliced_thread_data;
@@ -101,8 +104,9 @@ struct static_distributed_tensor
        static_ford<sequence<YSliceLengths...>>{}([&](auto idx) {
            constexpr auto idx_ys = idx + sequence<YSliceOrigins...>{};

-            thread_buf_(number<ThreadTensorDesc{}.calculate_offset(idx_ys)>{}) =
-                sliced_thread_data[number<sliced_thread_tensor_desc.calculate_offset(idx)>{}];
+            thread_buf_(number<ThreadTensorDesc{}.calculate_offset(idx_ys) / PackedSize>{}) =
+                sliced_thread_data[number<sliced_thread_tensor_desc.calculate_offset(idx) /
+                                          PackedSize>{}];
        });
    }

@@ -115,7 +119,7 @@ struct static_distributed_tensor
        constexpr auto y_idx = get_tile_distribution().get_y_indices_from_distributed_indices(
            TileDistributedIndices{});

-        return thread_buf_[number<ThreadTensorDesc{}.calculate_offset(y_idx)>{}];
+        return thread_buf_[number<ThreadTensorDesc{}.calculate_offset(y_idx) / PackedSize>{}];
    }

    template <typename TileDistributedIndices>
@@ -127,11 +131,11 @@ struct static_distributed_tensor
        constexpr auto y_idx = get_tile_distribution().get_y_indices_from_distributed_indices(
            TileDistributedIndices{});

-        return thread_buf_(number<ThreadTensorDesc{}.calculate_offset(y_idx)>{});
+        return thread_buf_(number<ThreadTensorDesc{}.calculate_offset(y_idx) / PackedSize>{});
    }

    //
-    thread_buffer<DataType, kThreadElementSpaceSize> thread_buf_;
+    thread_buffer<DataType, get_thread_buffer_size()> thread_buf_;
 };

 template <typename DataType, typename StaticTileDistribution>
--- a/include/ck_tile/core/tensor/tensor_view.hpp
+++ b/include/ck_tile/core/tensor/tensor_view.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -45,6 +45,8 @@ struct tensor_view
    using TensorIndex = array<index_t, TensorDesc::get_num_of_top_dimension()>;
    using TensorCoord = decltype(make_tensor_coordinate(TensorDesc{}, TensorIndex{}));
    static constexpr auto DstInMemOp = DstInMemOp_;
+    static constexpr index_t PackedSize =
+        ck_tile::numeric_traits<remove_cvref_t<DataType>>::PackedSize;

    CK_TILE_HOST_DEVICE constexpr tensor_view() = default;

@@ -81,8 +83,8 @@ struct tensor_view
                            bool_constant<oob_conditional_check> = {}) const
    {
        return buf_.template get<X>(
-            coord.get_offset(),
-            linear_offset,
+            coord.get_offset() / PackedSize,
+            linear_offset / PackedSize,
            coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
            bool_constant<oob_conditional_check>{});
    }
@@ -99,8 +101,8 @@ struct tensor_view
                            bool is_valid_element, // flag
                            bool_constant<oob_conditional_check> = {}) const
    {
-        return buf_.template get<X>(coord.get_offset(),
-                                    linear_offset,
+        return buf_.template get<X>(coord.get_offset() / PackedSize,
+                                    linear_offset / PackedSize,
                                    is_valid_element,
                                    bool_constant<oob_conditional_check>{});
    }
@@ -122,8 +124,8 @@ struct tensor_view
    {
        return buf_.template get_raw<X, oob_conditional_check, pre_nop>(
            dst,
-            coord.get_offset(),
-            linear_offset,
+            coord.get_offset() / PackedSize,
+            linear_offset / PackedSize,
            coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
            bool_constant<pre_nop>{});
    }
@@ -142,8 +144,12 @@ struct tensor_view
                                                         bool_constant<oob_conditional_check> = {},
                                                         bool_constant<pre_nop> = {}) const
    {
-        return buf_.template get_raw<X, oob_conditional_check, pre_nop>(
-            dst, coord.get_offset(), linear_offset, is_valid_element, bool_constant<pre_nop>{});
+        return buf_.template get_raw<X, oob_conditional_check, pre_nop>(dst,
+                                                                        coord.get_offset() /
+                                                                            PackedSize,
+                                                                        linear_offset / PackedSize,
+                                                                        is_valid_element,
+                                                                        bool_constant<pre_nop>{});
    }

    template <typename X,
@@ -159,8 +165,8 @@ struct tensor_view
    {
        return buf_.template async_get<X>(
            smem,
-            coord.get_offset(),
-            linear_offset,
+            coord.get_offset() / PackedSize,
+            linear_offset / PackedSize,
            coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
            bool_constant<oob_conditional_check>{});
    }
@@ -178,8 +184,8 @@ struct tensor_view
                                  bool is_valid_element) const
    {
        return buf_.template async_get<X>(smem,
-                                          coord.get_offset(),
-                                          linear_offset,
+                                          coord.get_offset() / PackedSize,
+                                          linear_offset / PackedSize,
                                          is_valid_element,
                                          bool_constant<oob_conditional_check>{});
    }
@@ -198,8 +204,8 @@ struct tensor_view
    {
        return buf_.template async_get_raw<X>(
            smem,
-            coord.get_offset(),
-            linear_offset,
+            coord.get_offset() / PackedSize,
+            linear_offset / PackedSize,
            coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
            bool_constant<pre_nop>{});
    }
@@ -217,8 +223,11 @@ struct tensor_view
                                      bool is_valid_element,
                                      bool_constant<pre_nop> = {}) const
    {
-        return buf_.template async_get_raw<X>(
-            smem, coord.get_offset(), linear_offset, is_valid_element, bool_constant<pre_nop>{});
+        return buf_.template async_get_raw<X>(smem,
+                                              coord.get_offset() / PackedSize,
+                                              linear_offset / PackedSize,
+                                              is_valid_element,
+                                              bool_constant<pre_nop>{});
    }

    // X is vector of DataType.
@@ -236,8 +245,8 @@ struct tensor_view
                            bool_constant<oob_conditional_check> = {})
    {
        buf_.template set<X, oob_conditional_check>(
-            coord.get_offset(),
-            linear_offset,
+            coord.get_offset() / PackedSize,
+            linear_offset / PackedSize,
            coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
            x);
    }
@@ -272,8 +281,8 @@ struct tensor_view
                                bool_constant<oob_conditional_check> = {})
    {
        buf_.template set_raw<X, oob_conditional_check>(
-            coord.get_offset(),
-            linear_offset,
+            coord.get_offset() / PackedSize,
+            linear_offset / PackedSize,
            coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
            x);
    }
@@ -292,7 +301,7 @@ struct tensor_view
                                bool_constant<oob_conditional_check> = {})
    {
        buf_.template set_raw<X, oob_conditional_check>(
-            coord.get_offset(), linear_offset, is_valid_element, x);
+            coord.get_offset() / PackedSize, linear_offset / PackedSize, is_valid_element, x);
    }

    // X is vector of DataType.
@@ -310,8 +319,8 @@ struct tensor_view
                               bool_constant<oob_conditional_check> = {})
    {
        buf_.template update<DstInMemOp, X, oob_conditional_check>(
-            coord.get_offset(),
-            linear_offset,
+            coord.get_offset() / PackedSize,
+            linear_offset / PackedSize,
            coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
            x);
    }
@@ -330,7 +339,7 @@ struct tensor_view
                               bool_constant<oob_conditional_check> = {})
    {
        buf_.template update<DstInMemOp, X, oob_conditional_check>(
-            coord.get_offset(), linear_offset, is_valid_element, x);
+            coord.get_offset() / PackedSize, linear_offset / PackedSize, is_valid_element, x);
    }

    // X is vector of DataType.
@@ -350,8 +359,8 @@ struct tensor_view
                                   bool_constant<pre_nop>               = {})
    {
        buf_.template update_raw<DstInMemOp, X, oob_conditional_check, pre_nop>(
-            coord.get_offset(),
-            linear_offset,
+            coord.get_offset() / PackedSize,
+            linear_offset / PackedSize,
            coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
            x);
    }
@@ -372,7 +381,7 @@ struct tensor_view
                                   bool_constant<pre_nop>               = {})
    {
        buf_.template update_raw<DstInMemOp, X, oob_conditional_check, pre_nop>(
-            coord.get_offset(), linear_offset, is_valid_element, x);
+            coord.get_offset() / PackedSize, linear_offset / PackedSize, is_valid_element, x);
    }

    CK_TILE_HOST_DEVICE void print() const
--- a/include/ck_tile/core/tensor/tile_window.hpp
+++ b/include/ck_tile/core/tensor/tile_window.hpp
@@ -97,13 +97,15 @@ struct tile_window_with_static_distribution
        }

        public:
+        static constexpr index_t PackedSize =
+            ck_tile::numeric_traits<remove_cvref_t<DataType>>::PackedSize;
        static constexpr index_t VectorDimY = get_vector_dim_y_scalar_per_vector().template at<0>();
        static constexpr index_t ScalarPerVector =
            get_vector_dim_y_scalar_per_vector().template at<1>();

        // using vector_type_t = vector_type_maker_t<DataType, ScalarPerVector>;
        // using vector_t      = typename vector_type_t::type;
-        using vector_t = thread_buffer<DataType, ScalarPerVector>;
+        using vector_t = thread_buffer<DataType, ScalarPerVector / PackedSize>;

        private:
        static constexpr auto scalars_per_access_ = [] {
@@ -336,7 +338,7 @@ struct tile_window_with_static_distribution
                        bottom_tensor_thread_coord, 0, bool_constant<oob_conditional_check>{});
 #if 1
                // write into distributed tensor
-                static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) {
+                static_for<0, Traits::ScalarPerVector, Traits::PackedSize>{}([&](auto j) {
                    constexpr auto idx_ys = generate_tuple(
                        [&](auto jj) {
                            return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
@@ -345,10 +347,11 @@ struct tile_window_with_static_distribution
                        number<NDimY>{});

                    constexpr index_t d =
-                        tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+                        tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
+                        Traits::PackedSize;

                    dst_tensor.get_thread_buffer().template at<d>() =
-                        vec_value.template get_as<DataType>()[j];
+                        vec_value.template get_as<DataType>()[j / Traits::PackedSize];
                });
 #else
                constexpr index_t d =
@@ -390,8 +393,9 @@ struct tile_window_with_static_distribution
        using SFC_Ys   = typename Traits::SFC_Ys;
        static constexpr index_t YElementSize =
            TileDstr{}.get_ys_to_d_descriptor().get_element_space_size();
-        static_assert(YElementSize % Traits::ScalarPerVector == 0);
-        using vectorized_tbuf = array<vector_t, YElementSize / Traits::ScalarPerVector>;
+        static_assert(YElementSize % (Traits::PackedSize * Traits::ScalarPerVector) == 0);
+        using vectorized_tbuf =
+            array<vector_t, YElementSize / (Traits::PackedSize * Traits::ScalarPerVector)>;
        // StaticBuffer<address_space_enum::vgpr,
        //                                      vector_t,
        //                                      YElementSize / Traits::ScalarPerVector,
@@ -419,7 +423,8 @@ struct tile_window_with_static_distribution
                // data index [y0, y1, ...]
                constexpr auto idx_ys_start = SFC_Ys::get_index(iAccess);
                constexpr index_t d =
-                    tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start);
+                    tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start) /
+                    Traits::PackedSize;
                static_assert(d % Traits::ScalarPerVector == 0);

                get_bottom_tensor_view().template get_vectorized_elements_raw<vector_t>(
@@ -632,7 +637,7 @@ struct tile_window_with_static_distribution
                // vector_type_t vec;
                vector_t vec_value;

-                static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) {
+                static_for<0, Traits::ScalarPerVector, Traits::PackedSize>{}([&](auto j) {
                    constexpr auto idx_ys = generate_tuple(
                        [&](auto jj) {
                            return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
@@ -641,9 +646,10 @@ struct tile_window_with_static_distribution
                        number<NDimY>{});

                    constexpr index_t d =
-                        tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+                        tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
+                        Traits::PackedSize;

-                    vec_value.template get_as<DataType>()(j) =
+                    vec_value.template get_as<DataType>()(j / Traits::PackedSize) =
                        dstr_tensor.get_thread_buffer().template at<d>();
                });

@@ -698,7 +704,7 @@ struct tile_window_with_static_distribution

                // read from distributed tensor
                vector_t vec_value;
-                static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) {
+                static_for<0, Traits::ScalarPerVector, Traits::PackedSize>{}([&](auto j) {
                    constexpr auto idx_ys = generate_tuple(
                        [&](auto jj) {
                            return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
@@ -706,8 +712,9 @@ struct tile_window_with_static_distribution
                        },
                        number<NDimY>{});
                    constexpr index_t d =
-                        tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
-                    vec_value.template get_as<DataType>()(j) =
+                        tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
+                        Traits::PackedSize;
+                    vec_value.template get_as<DataType>()(j / Traits::PackedSize) =
                        dstr_tensor.get_thread_buffer().template at<d>();
                });

@@ -759,7 +766,7 @@ struct tile_window_with_static_distribution
                // read from distributed tensor
                vector_t vec_value;

-                static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) {
+                static_for<0, Traits::ScalarPerVector, Traits::PackedSize>{}([&](auto j) {
                    constexpr auto idx_ys = generate_tuple(
                        [&](auto jj) {
                            return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
@@ -768,9 +775,10 @@ struct tile_window_with_static_distribution
                        number<NDimY>{});

                    constexpr index_t d =
-                        tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+                        tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
+                        Traits::PackedSize;

-                    vec_value.template get_as<DataType>()(j) =
+                    vec_value.template get_as<DataType>()(j / Traits::PackedSize) =
                        dstr_tensor.get_thread_buffer().template at<d>();
                });

@@ -825,7 +833,7 @@ struct tile_window_with_static_distribution
                // read from distributed tensor
                vector_t vec_value;

-                static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) {
+                static_for<0, Traits::ScalarPerVector, Traits::PackedSize>{}([&](auto j) {
                    constexpr auto idx_ys = generate_tuple(
                        [&](auto jj) {
                            return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
@@ -834,9 +842,10 @@ struct tile_window_with_static_distribution
                        number<NDimY>{});

                    constexpr index_t d =
-                        tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+                        tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
+                        Traits::PackedSize;

-                    vec_value.template get_as<DataType>()(j) =
+                    vec_value.template get_as<DataType>()(j / Traits::PackedSize) =
                        dstr_tensor.get_thread_buffer().template at<d>();
                });

--- a/include/ck_tile/core/tensor/tile_window_linear.hpp
+++ b/include/ck_tile/core/tensor/tile_window_linear.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once
 #include "ck_tile/core/arch/arch.hpp"
@@ -151,11 +151,13 @@ struct tile_window_linear
        }

        public:
+        static constexpr index_t PackedSize =
+            ck_tile::numeric_traits<remove_cvref_t<DataType>>::PackedSize;
        static constexpr index_t VectorDimY = get_vector_dim_y_scalar_per_vector().template at<0>();
        static constexpr index_t ScalarPerVector =
            get_vector_dim_y_scalar_per_vector().template at<1>();

-        using vector_t = thread_buffer<DataType, ScalarPerVector>;
+        using vector_t = thread_buffer<DataType, ScalarPerVector / PackedSize>;

        private:
        static constexpr auto scalars_per_access_ = [] {
@@ -498,17 +500,18 @@ struct tile_window_linear
            // data index [y0, y1, ...]
            constexpr auto idx_diff_ys = SFC_Ys::get_index(IAccess);
            // write into distributed tensor
-            static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
+            static_for<0, traits::ScalarPerVector, traits::PackedSize>{}([&](auto j) {
                constexpr auto idx_ys = generate_tuple(
                    [&](auto jj) {
                        return jj == traits::VectorDimY ? (idx_diff_ys[jj] + j) : idx_diff_ys[jj];
                    },
                    number<NDimY>{});

-                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
+                                      traits::PackedSize;

                dst_tensor.get_thread_buffer().template at<d>() =
-                    vec_value.template get_as<DataType>()[j];
+                    vec_value.template get_as<DataType>()[j / traits::PackedSize];
            });
 #else
            constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start);
@@ -556,17 +559,18 @@ struct tile_window_linear
            // data index [y0, y1, ...]
            constexpr auto idx_diff_ys = SFC_Ys::get_index(IAccess);
            // write into distributed tensor
-            static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
+            static_for<0, traits::ScalarPerVector, traits::PackedSize>{}([&](auto j) {
                constexpr auto idx_ys = generate_tuple(
                    [&](auto jj) {
                        return jj == traits::VectorDimY ? (idx_diff_ys[jj] + j) : idx_diff_ys[jj];
                    },
                    number<NDimY>{});

-                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
+                                      traits::PackedSize;

                dst_tensor.get_thread_buffer().template at<d>() =
-                    vec_value.template get_as<DataType>()[j];
+                    vec_value.template get_as<DataType>()[j / traits::PackedSize];
            });
 #else
            constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start);
@@ -595,8 +599,9 @@ struct tile_window_linear
        using SFC_Ys   = typename traits::SFC_Ys;
        static constexpr index_t YElementSize =
            TileDstr{}.get_ys_to_d_descriptor().get_element_space_size();
-        static_assert(YElementSize % traits::ScalarPerVector == 0);
-        using vectorized_tbuf = array<vector_t, YElementSize / traits::ScalarPerVector>;
+        static_assert(YElementSize % (traits::PackedSize * traits::ScalarPerVector) == 0);
+        using vectorized_tbuf =
+            array<vector_t, YElementSize / (traits::PackedSize * traits::ScalarPerVector)>;

        constexpr auto tile_dstr = TileDstr{};

@@ -620,7 +625,9 @@ struct tile_window_linear

            // data index [y0, y1, ...]
            constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess);
-            constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start);
+            constexpr index_t d =
+                tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start) /
+                traits::PackedSize;
            static_assert(d % traits::ScalarPerVector == 0);

            get_bottom_tensor_view().template get_vectorized_elements_raw<vector_t>(
@@ -804,16 +811,17 @@ struct tile_window_linear
            // read from distributed tensor
            vector_t vec_value;

-            static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
+            static_for<0, traits::ScalarPerVector, traits::PackedSize>{}([&](auto j) {
                constexpr auto idx_ys = generate_tuple(
                    [&](auto jj) {
                        return jj == traits::VectorDimY ? (idx_ys_start[jj] + j) : idx_ys_start[jj];
                    },
                    number<NDimY>{});

-                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
+                                      traits::PackedSize;

-                vec_value.template get_as<DataType>()(j) =
+                vec_value.template get_as<DataType>()(j / traits::PackedSize) =
                    dstr_tensor.get_thread_buffer().template at<d>();
            });

@@ -852,14 +860,15 @@ struct tile_window_linear

            // read from distributed tensor
            vector_t vec_value;
-            static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
+            static_for<0, traits::ScalarPerVector, traits::PackedSize>{}([&](auto j) {
                constexpr auto idx_ys = generate_tuple(
                    [&](auto jj) {
                        return jj == traits::VectorDimY ? (idx_ys_start[jj] + j) : idx_ys_start[jj];
                    },
                    number<NDimY>{});
-                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
-                vec_value.template get_as<DataType>()(j) =
+                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
+                                      traits::PackedSize;
+                vec_value.template get_as<DataType>()(j / traits::PackedSize) =
                    dstr_tensor.get_thread_buffer().template at<d>();
            });

@@ -897,16 +906,17 @@ struct tile_window_linear
            // read from distributed tensor
            vector_t vec_value;

-            static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
+            static_for<0, traits::ScalarPerVector, traits::PackedSize>{}([&](auto j) {
                constexpr auto idx_ys = generate_tuple(
                    [&](auto jj) {
                        return jj == traits::VectorDimY ? (idx_ys_start[jj] + j) : idx_ys_start[jj];
                    },
                    number<NDimY>{});

-                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
+                                      traits::PackedSize;

-                vec_value.template get_as<DataType>()(j) =
+                vec_value.template get_as<DataType>()(j / traits::PackedSize) =
                    dstr_tensor.get_thread_buffer().template at<d>();
            });

@@ -948,16 +958,17 @@ struct tile_window_linear
            // read from distributed tensor
            vector_t vec_value;

-            static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
+            static_for<0, traits::ScalarPerVector, traits::PackedSize>{}([&](auto j) {
                constexpr auto idx_ys = generate_tuple(
                    [&](auto jj) {
                        return jj == traits::VectorDimY ? (idx_ys_start[jj] + j) : idx_ys_start[jj];
                    },
                    number<NDimY>{});

-                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
+                                      traits::PackedSize;

-                vec_value.template get_as<DataType>()(j) =
+                vec_value.template get_as<DataType>()(j / traits::PackedSize) =
                    dstr_tensor.get_thread_buffer().template at<d>();
            });

--- a/include/ck_tile/host/check_err.hpp
+++ b/include/ck_tile/host/check_err.hpp
@@ -29,11 +29,12 @@ double get_relative_threshold(const int number_of_accumulations = 1)
    using I8   = int8_t;
    using I32  = int32_t;

-    static_assert(is_any_of<ComputeDataType, F8, BF8, F16, BF16, F32, I8, I32, int>::value,
-                  "Warning: Unhandled ComputeDataType for setting up the relative threshold!");
+    static_assert(
+        is_any_of<ComputeDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
+        "Warning: Unhandled ComputeDataType for setting up the relative threshold!");

    double compute_error = 0;
-    if constexpr(is_any_of<ComputeDataType, I8, I32, int>::value)
+    if constexpr(is_any_of<ComputeDataType, pk_int4_t, I8, I32, int>::value)
    {
        return 0;
    }
@@ -42,11 +43,11 @@ double get_relative_threshold(const int number_of_accumulations = 1)
        compute_error = std::pow(2, -numeric_traits<ComputeDataType>::mant) * 0.5;
    }

-    static_assert(is_any_of<OutDataType, F8, BF8, F16, BF16, F32, I8, I32, int>::value,
+    static_assert(is_any_of<OutDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
                  "Warning: Unhandled OutDataType for setting up the relative threshold!");

    double output_error = 0;
-    if constexpr(is_any_of<OutDataType, I8, I32, int>::value)
+    if constexpr(is_any_of<OutDataType, pk_int4_t, I8, I32, int>::value)
    {
        return 0;
    }
@@ -56,11 +57,11 @@ double get_relative_threshold(const int number_of_accumulations = 1)
    }
    double midway_error = std::max(compute_error, output_error);

-    static_assert(is_any_of<AccDataType, F8, BF8, F16, BF16, F32, I8, I32, int>::value,
+    static_assert(is_any_of<AccDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
                  "Warning: Unhandled AccDataType for setting up the relative threshold!");

    double acc_error = 0;
-    if constexpr(is_any_of<AccDataType, I8, I32, int>::value)
+    if constexpr(is_any_of<AccDataType, pk_int4_t, I8, I32, int>::value)
    {
        return 0;
    }
@@ -82,12 +83,13 @@ double get_absolute_threshold(const double max_possible_num, const int number_of
    using I8   = int8_t;
    using I32  = int32_t;

-    static_assert(is_any_of<ComputeDataType, F8, BF8, F16, BF16, F32, I8, I32, int>::value,
-                  "Warning: Unhandled ComputeDataType for setting up the absolute threshold!");
+    static_assert(
+        is_any_of<ComputeDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
+        "Warning: Unhandled ComputeDataType for setting up the absolute threshold!");

    auto expo            = std::log2(std::abs(max_possible_num));
    double compute_error = 0;
-    if constexpr(is_any_of<ComputeDataType, I8, I32, int>::value)
+    if constexpr(is_any_of<ComputeDataType, pk_int4_t, I8, I32, int>::value)
    {
        return 0;
    }
@@ -96,11 +98,11 @@ double get_absolute_threshold(const double max_possible_num, const int number_of
        compute_error = std::pow(2, expo - numeric_traits<ComputeDataType>::mant) * 0.5;
    }

-    static_assert(is_any_of<OutDataType, F8, BF8, F16, BF16, F32, I8, I32, int>::value,
+    static_assert(is_any_of<OutDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
                  "Warning: Unhandled OutDataType for setting up the absolute threshold!");

    double output_error = 0;
-    if constexpr(is_any_of<OutDataType, I8, I32, int>::value)
+    if constexpr(is_any_of<OutDataType, pk_int4_t, I8, I32, int>::value)
    {
        return 0;
    }
@@ -110,11 +112,11 @@ double get_absolute_threshold(const double max_possible_num, const int number_of
    }
    double midway_error = std::max(compute_error, output_error);

-    static_assert(is_any_of<AccDataType, F8, BF8, F16, BF16, F32, I8, I32, int>::value,
+    static_assert(is_any_of<AccDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
                  "Warning: Unhandled AccDataType for setting up the absolute threshold!");

    double acc_error = 0;
-    if constexpr(is_any_of<AccDataType, I8, I32, int>::value)
+    if constexpr(is_any_of<AccDataType, pk_int4_t, I8, I32, int>::value)
    {
        return 0;
    }
--- a/include/ck_tile/host/fill.hpp
+++ b/include/ck_tile/host/fill.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -282,7 +282,14 @@ struct FillMonotonicSeq
    {
        std::generate(first, last, [=, n = init_value_]() mutable {
            auto tmp = n;
-            n += step_;
+            if constexpr(std::is_same_v<decltype(tmp), pk_int4_t>)
+            {
+                n.data += step_.data;
+            }
+            else
+            {
+                n += step_;
+            }
            return tmp;
        });
    }
--- a/include/ck_tile/host/host_tensor.hpp
+++ b/include/ck_tile/host/host_tensor.hpp
@@ -281,18 +281,18 @@ struct HostTensor
    using Data       = std::vector<T>;

    template <typename X>
-    HostTensor(std::initializer_list<X> lens) : mDesc(lens), mData(mDesc.get_element_space_size())
+    HostTensor(std::initializer_list<X> lens) : mDesc(lens), mData(get_element_space_size())
    {
    }

    template <typename X, typename Y>
    HostTensor(std::initializer_list<X> lens, std::initializer_list<Y> strides)
-        : mDesc(lens, strides), mData(mDesc.get_element_space_size())
+        : mDesc(lens, strides), mData(get_element_space_size())
    {
    }

    template <typename Lengths>
-    HostTensor(const Lengths& lens) : mDesc(lens), mData(mDesc.get_element_space_size())
+    HostTensor(const Lengths& lens) : mDesc(lens), mData(get_element_space_size())
    {
    }

@@ -302,7 +302,7 @@ struct HostTensor
    {
    }

-    HostTensor(const Descriptor& desc) : mDesc(desc), mData(mDesc.get_element_space_size()) {}
+    HostTensor(const Descriptor& desc) : mDesc(desc), mData(get_element_space_size()) {}

    template <typename OutT>
    HostTensor<OutT> CopyAsType() const
@@ -340,7 +340,11 @@ struct HostTensor

    std::size_t get_element_size() const { return mDesc.get_element_size(); }

-    std::size_t get_element_space_size() const { return mDesc.get_element_space_size(); }
+    std::size_t get_element_space_size() const
+    {
+        constexpr index_t PackedSize = ck_tile::numeric_traits<remove_cvref_t<T>>::PackedSize;
+        return mDesc.get_element_space_size() / PackedSize;
+    }

    std::size_t get_element_space_size_in_bytes() const
    {
@@ -463,29 +467,27 @@ struct HostTensor
    template <typename... Is>
    std::size_t GetOffsetFromMultiIndex(Is... is) const
    {
-        return mDesc.GetOffsetFromMultiIndex(is...);
+        constexpr index_t PackedSize = ck_tile::numeric_traits<remove_cvref_t<T>>::PackedSize;
+        return mDesc.GetOffsetFromMultiIndex(is...) / PackedSize;
    }

    template <typename... Is>
    T& operator()(Is... is)
    {
-        return mData[mDesc.GetOffsetFromMultiIndex(is...)];
+        return mData[GetOffsetFromMultiIndex(is...)];
    }

    template <typename... Is>
    const T& operator()(Is... is) const
    {
-        return mData[mDesc.GetOffsetFromMultiIndex(is...)];
+        return mData[GetOffsetFromMultiIndex(is...)];
    }

-    T& operator()(std::vector<std::size_t> idx)
-    {
-        return mData[mDesc.GetOffsetFromMultiIndex(idx)];
-    }
+    T& operator()(std::vector<std::size_t> idx) { return mData[GetOffsetFromMultiIndex(idx)]; }

    const T& operator()(std::vector<std::size_t> idx) const
    {
-        return mData[mDesc.GetOffsetFromMultiIndex(idx)];
+        return mData[GetOffsetFromMultiIndex(idx)];
    }

    HostTensor<T> transpose(std::vector<size_t> axes = {}) const
--- a/include/ck_tile/host/reference/reference_gemm.hpp
+++ b/include/ck_tile/host/reference/reference_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -34,11 +34,35 @@ CK_TILE_HOST void reference_gemm(const HostTensor<ADataType>& a_m_k,

        for(std::size_t k = 0; k < K; ++k)
        {
-            ADataType v_a = a_element_op(a_m_k(m, k));
-            BDataType v_b = b_element_op(b_k_n(k, n));
-
-            v_acc +=
-                ck_tile::type_convert<AccDataType>(v_a) * ck_tile::type_convert<AccDataType>(v_b);
+            AccDataType v_a;
+            AccDataType v_b;
+            if constexpr(std::is_same_v<ADataType, pk_int4_t>)
+            {
+                const pk_int4_t pk_val  = a_element_op(a_m_k(m, k));
+                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(pk_val);
+                if(k % 2 == 1)
+                    v_a = fp32_val.hi;
+                else
+                    v_a = fp32_val.lo;
+            }
+            else
+            {
+                v_a = ck_tile::type_convert<AccDataType>(a_element_op(a_m_k(m, k)));
+            }
+            if constexpr(std::is_same_v<BDataType, pk_int4_t>)
+            {
+                const pk_int4_t pk_val  = b_element_op(b_k_n(k, n));
+                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(pk_val);
+                if(k % 2 == 1)
+                    v_b = fp32_val.hi;
+                else
+                    v_b = fp32_val.lo;
+            }
+            else
+            {
+                v_b = ck_tile::type_convert<AccDataType>(b_element_op(b_k_n(k, n)));
+            }
+            v_acc += v_a * v_b;
        }

        c_m_n(m, n) = ck_tile::type_convert<CDataType>(acc_element_op(v_acc));
@@ -73,6 +97,8 @@ __global__ void naive_gemm_kernel(ADataType* A,
        AccDataType acc = 0.0;
        for(int k = 0; k < K; ++k)
        {
+            constexpr index_t packed_size_a = ck_tile::numeric_traits<ADataType>::PackedSize;
+            constexpr index_t packed_size_b = ck_tile::numeric_traits<BDataType>::PackedSize;
            // Adjust indexing based on matrix layout
            int a_index = (std::is_same_v<LayoutA, tensor_layout::gemm::RowMajor>)
                              ? row * strideA + k
@@ -80,8 +106,34 @@ __global__ void naive_gemm_kernel(ADataType* A,
            int b_index = (std::is_same_v<LayoutB, tensor_layout::gemm::ColumnMajor>)
                              ? col * strideB + k
                              : k * strideB + col;
-            acc += ck_tile::type_convert<AccDataType>(A[a_index]) *
-                   ck_tile::type_convert<AccDataType>(B[b_index]);
+
+            AccDataType v_a;
+            AccDataType v_b;
+            if constexpr(std::is_same_v<ADataType, pk_int4_t>)
+            {
+                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(A[a_index / packed_size_a]);
+                if(k % 2 == 1)
+                    v_a = fp32_val.hi;
+                else
+                    v_a = fp32_val.lo;
+            }
+            else
+            {
+                v_a = ck_tile::type_convert<AccDataType>(A[a_index]);
+            }
+            if constexpr(std::is_same_v<BDataType, pk_int4_t>)
+            {
+                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(B[b_index / packed_size_b]);
+                if(k % 2 == 1)
+                    v_b = fp32_val.hi;
+                else
+                    v_b = fp32_val.lo;
+            }
+            else
+            {
+                v_b = ck_tile::type_convert<AccDataType>(B[b_index]);
+            }
+            acc += v_a * v_b;
        }

        int c_index = (std::is_same_v<LayoutC, tensor_layout::gemm::RowMajor>)
--- a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_one_pass.hpp
+++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_one_pass.hpp
@@ -157,7 +157,7 @@ struct AddRmsnorm2dRdquantFwdPipelineOnePass
        sweep_tile(qy, [&, yscale_ = yscale](auto idx) {
            constexpr auto i_idx = make_tuple(idx[number<0>{}]);
            auto qy_             = y[idx] / yscale_[i_idx];
-            qy(idx)              = saturates<QYDataType>{}(qy_);
+            qy(idx)              = type_convert<QYDataType>(saturates<QYDataType>{}(qy_));
        });
        store_tile(qy_window, qy);
    }
--- a/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp
+++ b/include/ck_tile/ops/add_rmsnorm2d_rdquant/pipeline/add_rmsnorm2d_rdquant_fwd_pipeline_three_pass.hpp
@@ -260,7 +260,7 @@ struct AddRmsnorm2dRdquantFwdPipelineThreePass
                const auto x_ = type_convert<ComputeDataType>(x[idx]);
                auto y_       = x_ * inv_rms[i_idx] * gamma_;
                auto qy_      = y_ / yscale[i_idx];
-                qy(idx)       = saturates<QYDataType>{}(qy_);
+                qy(idx)       = type_convert<QYDataType>(saturates<QYDataType>{}(qy_));
            });

            store_tile(qy_window, qy);
--- a/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
+++ b/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

@@ -9,20 +9,166 @@
 namespace ck_tile {
 namespace element_wise {

-#if 0
+// Fast int4x4 to fp16x8_t data type conversion based on paper
+// [Who Says Elephants Can't Run: Bringing Large Scale MoE Models into Cloud Scale Production]
+// (https://arxiv.org/abs/2211.10017) and implementation:
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
+CK_TILE_DEVICE fp16x4_t i4_to_half4(int q)
+{
+    const int LO = 0x000f000f;
+    const int HI = 0x00f000f0;
+    const int EX = 0x64006400;
+
+    int lo;
+    int hi;
+    // Extract the two int4 at low bit and create two fp16 number.
+    asm volatile("v_and_or_b32 %0, %1, %2, %3" : "=v"(lo) : "v"(q), "v"(LO), "v"(EX));
+    // Extract the two int4 at hight bit and create two fp16 number.
+    asm volatile("v_and_or_b32 %0, %1, %2, %3" : "=v"(hi) : "v"(q), "v"(HI), "v"(EX));
+
+    const int SUB = 0xE408E408; // half2 {-1032, -1032}
+    const int MUL = 0x2c002c00; // half2 {1 / 16, 1 / 16}
+    const int ADD = 0xd480d480; // half2 {-72, -72}
+
+    fp16x4_t res;
+
+    // for two fp16 from lowbit, subtract 1032 to get correct fp16 value
+    asm volatile("v_pk_add_f16 %0, %1, %2"
+                 : "=v"(res.lo)
+                 : "v"(bit_cast<fp16x2_t>(lo)), "v"(bit_cast<fp16x2_t>(SUB)));
+
+    // for two fp16 from highbit, divide 16 and subtract 72 to get correct fp16 value
+    asm volatile(
+        "v_pk_fma_f16 %0, %1, %2, %3"
+        : "=v"(res.hi)
+        : "v"(bit_cast<fp16x2_t>(hi)), "v"(bit_cast<fp16x2_t>(MUL)), "v"(bit_cast<fp16x2_t>(ADD)));
+
+    return res;
+}
+
+CK_TILE_DEVICE fp16x4_t i4_to_half4_scale(int q, const fp16x2_t& scale)
+{
+    const int LO = 0x000f000f;
+    const int HI = 0x00f000f0;
+    const int EX = 0x64006400;
+
+    int lo;
+    int hi;
+    // Extract the two int4 at low bit and create two fp16 number.
+    asm volatile("v_and_or_b32 %0, %1, %2, %3" : "=v"(lo) : "v"(q), "v"(LO), "v"(EX));
+    // Extract the two int4 at hight bit and create two fp16 number.
+    asm volatile("v_and_or_b32 %0, %1, %2, %3" : "=v"(hi) : "v"(q), "v"(HI), "v"(EX));
+
+    const int SUB = 0xE408E408; // half2 {-1032, -1032}
+    const int MUL = 0x2c002c00; // half2 {1 / 16, 1 / 16}
+    const int ADD = 0xd480d480; // half2 {-72, -72}
+
+    fp16x4_t res;
+
+    asm volatile("v_pk_add_f16 %0, %1, %2"
+                 : "=v"(res.lo)
+                 : "v"(bit_cast<fp16x2_t>(lo)), "v"(bit_cast<fp16x2_t>(SUB)));
+
+    asm volatile(
+        "v_pk_fma_f16 %0, %1, %2, %3"
+        : "=v"(res.hi)
+        : "v"(bit_cast<fp16x2_t>(hi)), "v"(bit_cast<fp16x2_t>(MUL)), "v"(bit_cast<fp16x2_t>(ADD)));
+
+    asm volatile("v_pk_mul_f16 %0, %1, %2" : "=v"(res.lo) : "v"(res.lo), "v"(scale));
+
+    asm volatile("v_pk_mul_f16 %0, %1, %2" : "=v"(res.hi) : "v"(res.hi), "v"(scale));
+
+    return res;
+}
+
+CK_TILE_DEVICE bf16x4_t i4_to_bhalf4(int q)
+{
+    uint32_t i8s = (q & 0xf) | ((q & 0xf0) << 4) | ((q & 0xf00) << 8) | ((q & 0xf000) << 12);
+
+    static constexpr uint32_t fp32_base = 0x4B000000;
+
+    float fp32_intermediates[4];
+
+    uint32_t* fp32_intermediates_casted = reinterpret_cast<uint32_t*>(fp32_intermediates);
+
+    fp32_intermediates_casted[0] = __byte_perm(i8s, fp32_base, 0x7650);
+    fp32_intermediates_casted[1] = __byte_perm(i8s, fp32_base, 0x7651);
+    fp32_intermediates_casted[2] = __byte_perm(i8s, fp32_base, 0x7652);
+    fp32_intermediates_casted[3] = __byte_perm(i8s, fp32_base, 0x7653);
+
+    fp32_intermediates[0] -= 8388616.f;
+    fp32_intermediates[1] -= 8388616.f;
+    fp32_intermediates[2] -= 8388616.f;
+    fp32_intermediates[3] -= 8388616.f;
+
+    bf16x4_t res;
+    res.lo = bit_cast<bf16x2_t>(
+        __byte_perm(fp32_intermediates_casted[1], fp32_intermediates_casted[0], 0x7632));
+    res.hi = bit_cast<bf16x2_t>(
+        __byte_perm(fp32_intermediates_casted[3], fp32_intermediates_casted[2], 0x7632));
+
+    return res;
+}
+
+struct PassThroughPack8
+{
+    template <typename Y, typename X>
+    CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const;
+
+    CK_TILE_HOST_DEVICE constexpr void operator()(fp16x8_t& y, const pk_int4x4_t& x) const
+    {
+        y.lo = i4_to_half4(bit_cast<int>(x));
+        y.hi = i4_to_half4(bit_cast<int>(x) >> 8);
+    }
+
+    CK_TILE_HOST_DEVICE constexpr void operator()(bf16x8_t& y, const pk_int4x4_t& x) const
+    {
+        y.lo = i4_to_bhalf4(bit_cast<int>(x));
+        y.hi = i4_to_bhalf4(bit_cast<int>(x) >> 16);
+    }
+    constexpr const static bool is_pack8_invocable = true;
+};
+
+struct DequantPack8
+{
+    template <typename Y, typename X, typename Z>
+    CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x, const Z& z) const;
+
+    CK_TILE_HOST_DEVICE constexpr void
+    operator()(fp16x8_t& y, const pk_int4x4_t& x, const fp16x2_t& z) const
+    {
+        y.lo = i4_to_half4_scale(bit_cast<int>(x), z);
+        y.hi = i4_to_half4_scale(bit_cast<int>(x) >> 8, z);
+    }
+
+    constexpr const static bool is_pack8_invocable = true;
+};
+
 struct PassThroughPack2
 {
    template <typename Y, typename X>
    CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const;

-    CK_TILE_HOST_DEVICE constexpr void operator()(ck_tile::half2_t& y, const ck_tile::f8x2_t& x) const
+#if 0
+    CK_TILE_HOST_DEVICE constexpr void operator()(ck_tile::fp16x2_t& y, const ck_tile::f8x2_t& x) const
    {
        auto t = type_convert<float2_t>(x);
-        y      = type_convert<half2_t>(t);
+        y      = type_convert<fp16x2_t>(t);
    }
+#endif
+
+    CK_TILE_HOST_DEVICE constexpr void operator()(fp16x2_t& y, const pk_int4_t& x) const
+    {
+        uint8_t x_u8 = bit_cast<uint8_t>(x);
+        uint8_t x_l  = (x_u8 & 0x0f) >> 0;
+        uint8_t x_h  = (x_u8 & 0xf0) >> 4;
+
+        y.lo = type_convert<half_t>(x_l);
+        y.hi = type_convert<half_t>(x_h);
+    }
+
    constexpr const static bool is_pack2_invocable = true;
 };
-#endif

 struct PassThrough
 {
--- a/include/ck_tile/ops/fmha/block/block_masking.hpp
+++ b/include/ck_tile/ops/fmha/block/block_masking.hpp
@@ -310,7 +310,7 @@ struct SimplifiedGenericAttentionMask

        const index_t x_per_split = ck_tile::max(1, integer_divide_ceil(x_total, num_splits));
        const index_t split_start = x_per_split * i_split;
-        const index_t split_end   = split_start + x_per_split;
+        const index_t split_end   = ck_tile::min(x_total, split_start + x_per_split);

        return ck_tile::make_tuple(ck_tile::max(origin_start, split_start),
                                   ck_tile::min(origin_end, split_end));
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
@@ -742,7 +742,7 @@ struct FmhaFwdSplitKVKernel
                return pad_tensor_view(
                    v_dram_transposed,
                    make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kK1>{}),
-                    sequence<kPadHeadDimV, false>{});
+                    sequence<kPadHeadDimV, kPadSeqLenK>{});
            }
            else
            {
--- a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp
+++ b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp
@@ -1,11 +1,12 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.

 #pragma once

 #include "ck_tile/core.hpp"
 #include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
+#include "ck_tile/ops/elementwise.hpp"

 namespace ck_tile {

@@ -20,12 +21,13 @@ struct BlockUniversalGemmAsBsCr
    template <typename PipelineProblem_, typename GemmPolicy_>
    struct GemmTraits_
    {
-        using Problem        = remove_cvref_t<PipelineProblem_>;
-        using Policy         = remove_cvref_t<GemmPolicy_>;
-        using ADataType      = remove_cvref_t<typename Problem::ADataType>;
-        using BDataType      = remove_cvref_t<typename Problem::BDataType>;
-        using CDataType      = remove_cvref_t<typename Problem::CDataType>;
-        using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
+        using Problem         = remove_cvref_t<PipelineProblem_>;
+        using Policy          = remove_cvref_t<GemmPolicy_>;
+        using ADataType       = remove_cvref_t<typename Problem::ADataType>;
+        using BDataType       = remove_cvref_t<typename Problem::BDataType>;
+        using ComputeDataType = remove_cvref_t<typename Problem::ComputeDataType>;
+        using CDataType       = remove_cvref_t<typename Problem::CDataType>;
+        using BlockGemmShape  = remove_cvref_t<typename Problem::BlockGemmShape>;

        static constexpr index_t kBlockSize = Problem::kBlockSize;
        static constexpr auto Scheduler     = Problem::Scheduler;
@@ -71,10 +73,10 @@ struct BlockUniversalGemmAsBsCr
        using BWarpTileDistr = remove_cvref_t<decltype(make_static_tile_distribution(
            typename WarpGemm::BWarpDstrEncoding{}))>;

-        using AWarpTile =
-            remove_cvref_t<decltype(make_static_distributed_tensor<ADataType>(AWarpTileDistr{}))>;
-        using BWarpTile =
-            remove_cvref_t<decltype(make_static_distributed_tensor<BDataType>(BWarpTileDistr{}))>;
+        using AWarpTile = remove_cvref_t<decltype(make_static_distributed_tensor<ComputeDataType>(
+            AWarpTileDistr{}))>;
+        using BWarpTile = remove_cvref_t<decltype(make_static_distributed_tensor<ComputeDataType>(
+            BWarpTileDistr{}))>;

        // TODO: Should we have two policies? Interwave & Intrawave ??
        static constexpr index_t InterWaveSchedulingMacClusters = 1;
@@ -90,9 +92,10 @@ struct BlockUniversalGemmAsBsCr
    public:
    using Traits = GemmTraits_<Problem_, Policy_>;

-    using ADataType = remove_cvref_t<typename Traits::ADataType>;
-    using BDataType = remove_cvref_t<typename Traits::BDataType>;
-    using CDataType = remove_cvref_t<typename Traits::CDataType>;
+    using ADataType       = remove_cvref_t<typename Traits::ADataType>;
+    using BDataType       = remove_cvref_t<typename Traits::BDataType>;
+    using ComputeDataType = remove_cvref_t<typename Traits::ComputeDataType>;
+    using CDataType       = remove_cvref_t<typename Traits::CDataType>;

    using WarpGemm = remove_cvref_t<typename Traits::WarpGemm>;

@@ -105,10 +108,34 @@ struct BlockUniversalGemmAsBsCr

    static constexpr auto Scheduler = Traits::Scheduler;

+    static constexpr index_t APackedSize =
+        ck_tile::numeric_traits<remove_cvref_t<ADataType>>::PackedSize;
+    static constexpr index_t BPackedSize =
+        ck_tile::numeric_traits<remove_cvref_t<BDataType>>::PackedSize;
+
    using I0 = number<0>;
    using I1 = number<1>;

    private:
+    template <typename WarpWindow, typename WarpTile>
+    CK_TILE_DEVICE static void load_interleaved_pk_type(const WarpWindow& warp_window,
+                                                        WarpTile& warp_tile)
+    {
+        constexpr index_t UnaryOpSize = 8;
+        const element_wise::PassThroughPack8 elementwise_op{};
+        constexpr index_t thread_buffer_size =
+            Traits::AWarpTile::get_thread_buffer_size() / UnaryOpSize;
+        const auto in_dstr_tensors = load_tile(warp_window);
+
+        static_assert(Traits::AWarpTile::get_thread_buffer_size() % UnaryOpSize == 0);
+
+        using ComputeVectorType = ComputeDataType __attribute__((ext_vector_type(UnaryOpSize)));
+        static_for<0, thread_buffer_size, 1>{}([&](auto i) {
+            elementwise_op(warp_tile.get_thread_buffer().template get_as<ComputeVectorType>()(i),
+                           in_dstr_tensors.get_thread_buffer().template get_as<pk_int4x4_t>()[i]);
+        });
+    }
+
    template <GemmPipelineScheduler Scheduler, typename GemmTraits>
    struct BlockGemmImpl
    {
@@ -208,6 +235,8 @@ struct BlockUniversalGemmAsBsCr
            });

            using CWarpDstr   = typename WarpGemm::CWarpDstr;
+            using AWarpTensor = typename WarpGemm::AWarpTensor;
+            using BWarpTensor = typename WarpGemm::BWarpTensor;
            using CWarpTensor = typename WarpGemm::CWarpTensor;

            constexpr auto c_warp_y_lengths =
@@ -217,10 +246,26 @@ struct BlockUniversalGemmAsBsCr
            // hot loop:
            static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) {
                static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
-                    const auto a_warp_tile = load_tile(a_warp_windows(mIter)(kIter));
+                    AWarpTensor a_warp_tile;
+                    if constexpr(std::is_same_v<ADataType, pk_int4_t>)
+                    {
+                        load_interleaved_pk_type(a_warp_windows(mIter)(kIter), a_warp_tile);
+                    }
+                    else
+                    {
+                        a_warp_tile = load_tile(a_warp_windows(mIter)(kIter));
+                    }

                    static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
-                        const auto b_warp_tile = load_tile(b_warp_windows(nIter)(kIter));
+                        BWarpTensor b_warp_tile;
+                        if constexpr(std::is_same_v<BDataType, pk_int4_t>)
+                        {
+                            load_interleaved_pk_type(b_warp_windows(nIter)(kIter), b_warp_tile);
+                        }
+                        else
+                        {
+                            b_warp_tile = load_tile(b_warp_windows(nIter)(kIter));
+                        }

                        // read C warp tensor from C block tensor-
                        CWarpTensor c_warp_tensor;
@@ -342,11 +387,27 @@ struct BlockUniversalGemmAsBsCr
            static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
                static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
                    // read A warp tensor from A block window
-                    load_tile(a_warp_tiles_(mIter)(kIter), a_warp_windows(mIter)(kIter));
+                    if constexpr(std::is_same_v<ADataType, pk_int4_t>)
+                    {
+                        load_interleaved_pk_type(a_warp_windows(mIter)(kIter),
+                                                 a_warp_tiles_(mIter)(kIter));
+                    }
+                    else
+                    {
+                        a_warp_tiles_(mIter)(kIter) = load_tile(a_warp_windows(mIter)(kIter));
+                    }
                });
                static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
                    // read B warp tensor from B Block window
-                    load_tile(b_warp_tiles_(nIter)(kIter), b_warp_windows(nIter)(kIter));
+                    if constexpr(std::is_same_v<BDataType, pk_int4_t>)
+                    {
+                        load_interleaved_pk_type(b_warp_windows(nIter)(kIter),
+                                                 b_warp_tiles_(nIter)(kIter));
+                    }
+                    else
+                    {
+                        b_warp_tiles_(nIter)(kIter) = load_tile(b_warp_windows(nIter)(kIter));
+                    }
                });
            });
        }
@@ -504,12 +565,27 @@ struct BlockUniversalGemmAsBsCr
            // TODO check if a_warp_tiles has same desc as a_warp_window
            static_for<0, KInnerLoopIter, 1>{}([&](auto kIter) {
                static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
-                    // read A warp tensor from A block window
-                    load_tile(a_warp_tiles_(mIter)(kIter), a_warp_windows(mIter)(kIter));
+                    if constexpr(std::is_same_v<ADataType, pk_int4_t>)
+                    {
+                        load_interleaved_pk_type(a_warp_windows(mIter)(kIter),
+                                                 a_warp_tiles_(mIter)(kIter));
+                    }
+                    else
+                    {
+                        a_warp_tiles_(mIter)(kIter) = load_tile(a_warp_windows(mIter)(kIter));
+                    }
                });
                static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
                    // read B warp tensor from B Block window
-                    load_tile(b_warp_tiles_(nIter)(kIter), b_warp_windows(nIter)(kIter));
+                    if constexpr(std::is_same_v<BDataType, pk_int4_t>)
+                    {
+                        load_interleaved_pk_type(b_warp_windows(nIter)(kIter),
+                                                 b_warp_tiles_(nIter)(kIter));
+                    }
+                    else
+                    {
+                        b_warp_tiles_(nIter)(kIter) = load_tile(b_warp_windows(nIter)(kIter));
+                    }
                });
            });
        }
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp
@@ -54,6 +54,11 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
    using CDataType      = remove_cvref_t<typename Problem::CDataType>;
    using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;

+    static constexpr index_t APackedSize =
+        ck_tile::numeric_traits<remove_cvref_t<ADataType>>::PackedSize;
+    static constexpr index_t BPackedSize =
+        ck_tile::numeric_traits<remove_cvref_t<BDataType>>::PackedSize;
+
    using ALayout = remove_cvref_t<typename Problem::ALayout>;
    using BLayout = remove_cvref_t<typename Problem::BLayout>;
    using CLayout = remove_cvref_t<typename Problem::CLayout>;
@@ -196,12 +201,12 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>

            // A/B split schedule
            // compiler is likely to use ds_read2 when instruction width smaller than 16bytes
-            constexpr auto num_ds_read_inst_a = A_LDS_Read_Width * sizeof(ADataType) == 16
-                                                    ? A_LDS_Read_Inst_Num
-                                                    : A_LDS_Read_Inst_Num / 2;
-            constexpr auto num_ds_read_inst_b = B_LDS_Read_Width * sizeof(BDataType) == 16
-                                                    ? B_LDS_Read_Inst_Num
-                                                    : B_LDS_Read_Inst_Num / 2;
+            constexpr auto num_ds_read_inst_a =
+                A_LDS_Read_Width * sizeof(ADataType) / APackedSize == 16 ? A_LDS_Read_Inst_Num
+                                                                         : A_LDS_Read_Inst_Num / 2;
+            constexpr auto num_ds_read_inst_b =
+                B_LDS_Read_Width * sizeof(BDataType) / BPackedSize == 16 ? B_LDS_Read_Inst_Num
+                                                                         : B_LDS_Read_Inst_Num / 2;

            constexpr auto num_ds_write_inst_a = A_LDS_Write_Inst_Num;
            constexpr auto num_ds_write_inst_b = B_LDS_Write_Inst_Num;
@@ -213,9 +218,9 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>

            constexpr auto mfma_cycle = NPerXDL == 16 ? 16 : 32;
            constexpr auto ds_read_a_issue_cycle =
-                A_LDS_Read_Width * sizeof(ADataType) == 16 ? 8 : 4;
+                A_LDS_Read_Width * sizeof(ADataType) / APackedSize == 16 ? 8 : 4;
            constexpr auto ds_read_b_issue_cycle =
-                B_LDS_Read_Width * sizeof(BDataType) == 16 ? 8 : 4;
+                B_LDS_Read_Width * sizeof(BDataType) / BPackedSize == 16 ? 8 : 4;
            constexpr auto ds_read_a_mfma_rate =
                (mfma_cycle - 4 + 2 * ds_read_a_issue_cycle - 1) / (2 * ds_read_a_issue_cycle);
            constexpr auto ds_read_b_mfma_rate =
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp
@@ -60,6 +60,13 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
    using CDataType      = remove_cvref_t<typename Problem::CDataType>;
    using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;

+    static_assert(!std::is_same_v<BDataType, pk_int4_t>, "Not implemented");
+
+    static constexpr index_t APackedSize =
+        ck_tile::numeric_traits<remove_cvref_t<ADataType>>::PackedSize;
+    static constexpr index_t BPackedSize =
+        ck_tile::numeric_traits<remove_cvref_t<BDataType>>::PackedSize;
+
    using ALayout = remove_cvref_t<typename Problem::ALayout>;
    using BLayout = remove_cvref_t<typename Problem::BLayout>;
    using CLayout = remove_cvref_t<typename Problem::CLayout>;
@@ -139,12 +146,12 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
                                                (BlockSize / WaveSize) /
                                                (MPerXDL * NPerXDL * KPerXDL);

-            constexpr auto num_ds_read_inst_a = A_LDS_Read_Width * sizeof(ADataType) == 16
-                                                    ? A_LDS_Read_Inst_Num
-                                                    : A_LDS_Read_Inst_Num / 2;
-            constexpr auto num_ds_read_inst_b = B_LDS_Read_Width * sizeof(BDataType) == 16
-                                                    ? B_LDS_Read_Inst_Num
-                                                    : B_LDS_Read_Inst_Num / 2;
+            constexpr auto num_ds_read_inst_a =
+                A_LDS_Read_Width * sizeof(ADataType) / APackedSize == 16 ? A_LDS_Read_Inst_Num
+                                                                         : A_LDS_Read_Inst_Num / 2;
+            constexpr auto num_ds_read_inst_b =
+                B_LDS_Read_Width * sizeof(BDataType) / BPackedSize == 16 ? B_LDS_Read_Inst_Num
+                                                                         : B_LDS_Read_Inst_Num / 2;

            constexpr auto num_ds_read_inst     = num_ds_read_inst_a + num_ds_read_inst_b;
            constexpr auto num_ds_write_inst    = A_LDS_Write_Inst_Num + B_LDS_Write_Inst_Num;
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
@@ -21,6 +21,13 @@ struct BaseGemmPipelineAgBgCrMem
    using BDataType      = remove_cvref_t<typename Problem::BDataType>;
    using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;

+    static_assert(!std::is_same_v<BDataType, pk_int4_t>, "Not implemented");
+
+    static constexpr index_t APackedSize =
+        ck_tile::numeric_traits<remove_cvref_t<ADataType>>::PackedSize;
+    static constexpr index_t BPackedSize =
+        ck_tile::numeric_traits<remove_cvref_t<BDataType>>::PackedSize;
+
    CK_TILE_HOST_DEVICE static constexpr auto TransposeC() { return Problem::TransposeC; }

    static constexpr index_t BlockSize = Problem::kBlockSize;
@@ -33,9 +40,11 @@ struct BaseGemmPipelineAgBgCrMem

    static constexpr index_t WgpPerCU =
        (4 * get_warp_size() / BlockSize) >= 1 ? 4 * get_warp_size() / BlockSize : 1;
-    static constexpr index_t FullMemBandPrefetchStages = integer_divide_ceil(
-        MinMemInFlyBytes / WgpPerCU,
-        (MPerBlock * sizeof(ADataType) + NPerBlock * sizeof(BDataType)) * KPerBlock);
+    static constexpr index_t FullMemBandPrefetchStages =
+        integer_divide_ceil(MinMemInFlyBytes / WgpPerCU,
+                            (MPerBlock * sizeof(ADataType) / APackedSize +
+                             NPerBlock * sizeof(BDataType) / BPackedSize) *
+                                KPerBlock);
    static constexpr index_t PrefetchStages =
        FullMemBandPrefetchStages >= 2
            ? FullMemBandPrefetchStages <= 8 ? FullMemBandPrefetchStages : 8
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
@@ -67,16 +67,22 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
    template <typename Problem>
    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeA()
    {
-        constexpr index_t smem_size_a = sizeof(typename Problem::ADataType) *
-                                        MakeALdsBlockDescriptor<Problem>().get_element_space_size();
+        constexpr index_t PackedSize =
+            ck_tile::numeric_traits<remove_cvref_t<typename Problem::ADataType>>::PackedSize;
+        constexpr index_t smem_size_a =
+            sizeof(typename Problem::ADataType) *
+            MakeALdsBlockDescriptor<Problem>().get_element_space_size() / PackedSize;
        return smem_size_a;
    }

    template <typename Problem>
    CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeB()
    {
-        constexpr index_t smem_size_b = sizeof(typename Problem::BDataType) *
-                                        MakeBLdsBlockDescriptor<Problem>().get_element_space_size();
+        constexpr index_t PackedSize =
+            ck_tile::numeric_traits<remove_cvref_t<typename Problem::BDataType>>::PackedSize;
+        constexpr index_t smem_size_b =
+            sizeof(typename Problem::BDataType) *
+            MakeBLdsBlockDescriptor<Problem>().get_element_space_size() / PackedSize;
        return smem_size_b;
    }

@@ -387,8 +393,8 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
        using AccDataType     = float;
        using BlockWarps      = typename Problem::BlockGemmShape::BlockWarps;
        using WarpTile        = typename Problem::BlockGemmShape::WarpTile;
-        using WarpGemm        = WarpGemmMfmaDispatcher<typename Problem::ADataType,
-                                                typename Problem::BDataType,
+        using WarpGemm        = WarpGemmMfmaDispatcher<typename Problem::ComputeDataType,
+                                                typename Problem::ComputeDataType,
                                                AccDataType,
                                                WarpTile::at(I0),
                                                WarpTile::at(I1),
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp
@@ -20,6 +20,11 @@ struct GemmPipelineAGmemBGmemCRegV2
    using CDataType      = remove_cvref_t<typename Problem::CDataType>;
    using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;

+    static constexpr index_t APackedSize =
+        ck_tile::numeric_traits<remove_cvref_t<ADataType>>::PackedSize;
+    static constexpr index_t BPackedSize =
+        ck_tile::numeric_traits<remove_cvref_t<BDataType>>::PackedSize;
+
    static constexpr index_t kBlockSize = Problem::kBlockSize;

    static constexpr index_t kMPerBlock = BlockGemmShape::kM;
@@ -37,13 +42,15 @@ struct GemmPipelineAGmemBGmemCRegV2

    CK_TILE_HOST_DEVICE static constexpr index_t GetStaticLdsSize()
    {
-        return integer_divide_ceil(
-                   sizeof(ADataType) *
-                       Policy::template MakeALdsBlockDescriptor<Problem>().get_element_space_size(),
-                   16) *
+        return integer_divide_ceil(sizeof(ADataType) *
+                                       Policy::template MakeALdsBlockDescriptor<Problem>()
+                                           .get_element_space_size() /
+                                       APackedSize,
+                                   16) *
                   16 +
               sizeof(BDataType) *
-                   Policy::template MakeBLdsBlockDescriptor<Problem>().get_element_space_size();
+                   Policy::template MakeBLdsBlockDescriptor<Problem>().get_element_space_size() /
+                   BPackedSize;
    }

    template <typename ADramBlockWindowTmp,
@@ -75,7 +82,8 @@ struct GemmPipelineAGmemBGmemCRegV2
        auto a_lds_block = make_tensor_view<address_space_enum::lds>(p_a_lds, a_lds_block_desc);

        constexpr index_t a_lds_block_space_size_aligned =
-            integer_divide_ceil(sizeof(ADataType) * a_lds_block_desc.get_element_space_size(), 16) *
+            integer_divide_ceil(
+                sizeof(ADataType) * a_lds_block_desc.get_element_space_size() / APackedSize, 16) *
            16;

        // B tile in LDS
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp
@@ -13,14 +13,16 @@ template <typename ADataType_,
          typename BDataType_,
          typename CDataType_,
          typename BlockGemmShape_,
-          typename Traits_>
+          typename Traits_,
+          typename ComputeDataType_ = ADataType_>
 struct GemmPipelineProblemBase
 {
    using Traits = remove_cvref_t<Traits_>;

-    using ADataType = remove_cvref_t<ADataType_>;
-    using BDataType = remove_cvref_t<BDataType_>;
-    using CDataType = remove_cvref_t<CDataType_>;
+    using ADataType       = remove_cvref_t<ADataType_>;
+    using BDataType       = remove_cvref_t<BDataType_>;
+    using CDataType       = remove_cvref_t<CDataType_>;
+    using ComputeDataType = remove_cvref_t<ComputeDataType_>;

    using BlockGemmShape = remove_cvref_t<BlockGemmShape_>;

@@ -53,13 +55,15 @@ struct GemmPipelineProblemBase

    CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentA()
    {
+        constexpr index_t PackedSize =
+            ck_tile::numeric_traits<remove_cvref_t<ADataType>>::PackedSize;
        if constexpr(std::is_same_v<ALayout, ck_tile::tensor_layout::gemm::ColumnMajor>)
        {
            constexpr index_t pixels_per_thread =
                BlockGemmShape::kM * BlockGemmShape::kK / kBlockSize;
-            return pixels_per_thread < VectorLoadSize / sizeof(ADataType)
+            return pixels_per_thread < PackedSize * VectorLoadSize / sizeof(ADataType)
                       ? pixels_per_thread
-                       : VectorLoadSize / sizeof(ADataType);
+                       : PackedSize * VectorLoadSize / sizeof(ADataType);
        }
        else
        {
@@ -69,17 +73,19 @@ struct GemmPipelineProblemBase

    CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentB()
    {
+        constexpr index_t PackedSize =
+            ck_tile::numeric_traits<remove_cvref_t<BDataType>>::PackedSize;
        if constexpr(std::is_same_v<BLayout, ck_tile::tensor_layout::gemm::RowMajor>)
        {
            constexpr index_t pixels_per_thread =
                BlockGemmShape::kN * BlockGemmShape::kK / kBlockSize;
-            return pixels_per_thread < VectorLoadSize / sizeof(BDataType)
+            return pixels_per_thread < PackedSize * VectorLoadSize / sizeof(BDataType)
                       ? pixels_per_thread
-                       : VectorLoadSize / sizeof(BDataType);
+                       : PackedSize * VectorLoadSize / sizeof(BDataType);
        }
        else
        {
-            return VectorLoadSize / sizeof(BDataType);
+            return PackedSize * VectorLoadSize / sizeof(BDataType);
        }
    }

@@ -143,9 +149,14 @@ template <typename ADataType_,
          typename BDataType_,
          typename CDataType_,
          typename BlockGemmShape_,
-          typename Traits_>
-using GemmPipelineProblem =
-    GemmPipelineProblemBase<ADataType_, BDataType_, CDataType_, BlockGemmShape_, Traits_>;
+          typename Traits_,
+          typename ComputeDataType_ = ADataType_>
+using GemmPipelineProblem = GemmPipelineProblemBase<ADataType_,
+                                                    BDataType_,
+                                                    CDataType_,
+                                                    BlockGemmShape_,
+                                                    Traits_,
+                                                    ComputeDataType_>;

 template <typename ADataType_,
          typename BDataType_,
@@ -154,14 +165,16 @@ template <typename ADataType_,
          typename Traits_,
          GemmPipelineScheduler Scheduler_ = GemmPipelineScheduler::Intrawave,
          bool HasHotLoop_                 = true,
-          TailNumber TailNum_              = TailNumber::Full>
+          TailNumber TailNum_              = TailNumber::Full,
+          typename ComputeDataType_        = ADataType_>
 struct UniversalGemmPipelineProblem
 {
    using Traits = remove_cvref_t<Traits_>;

-    using ADataType = remove_cvref_t<ADataType_>;
-    using BDataType = remove_cvref_t<BDataType_>;
-    using CDataType = remove_cvref_t<CDataType_>;
+    using ADataType       = remove_cvref_t<ADataType_>;
+    using BDataType       = remove_cvref_t<BDataType_>;
+    using CDataType       = remove_cvref_t<CDataType_>;
+    using ComputeDataType = remove_cvref_t<ComputeDataType_>;

    using BlockGemmShape = remove_cvref_t<BlockGemmShape_>;

--- a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
@@ -34,31 +34,41 @@ struct UniversalGemmBasePolicy
        constexpr index_t BlockSize           = Problem::kBlockSize;
        constexpr index_t KPerBlock           = Problem::BlockGemmShape::kK;
        constexpr index_t elements_per_thread = MNPerBlock * KPerBlock / BlockSize;
+        constexpr index_t PackedSize =
+            ck_tile::numeric_traits<remove_cvref_t<DataType>>::PackedSize;

        // Assume DataType is even!
-        if constexpr(XPerTile % (16 / sizeof(DataType)) == 0 &&
-                     elements_per_thread % (16 / sizeof(DataType)) == 0)
+        if constexpr(XPerTile % (PackedSize * 32 / sizeof(DataType)) == 0 &&
+                     elements_per_thread % (PackedSize * 32 / sizeof(DataType)) == 0 &&
+                     PackedSize == 2)
        {
-            return (16 / sizeof(DataType));
+            return (PackedSize * 32 / sizeof(DataType));
        }
-        else if constexpr(XPerTile % (8 / sizeof(DataType)) == 0 &&
-                          elements_per_thread % (8 / sizeof(DataType)) == 0)
+        else if constexpr(XPerTile % (PackedSize * 16 / sizeof(DataType)) == 0 &&
+                          elements_per_thread % (PackedSize * 16 / sizeof(DataType)) == 0)
        {
-            return (8 / sizeof(DataType));
+            return (PackedSize * 16 / sizeof(DataType));
        }
-        else if constexpr(sizeof(DataType) >= 4 && XPerTile % (4 / sizeof(DataType)) == 0 &&
-                          elements_per_thread % (4 / sizeof(DataType)) == 0)
+        else if constexpr(XPerTile % (PackedSize * 8 / sizeof(DataType)) == 0 &&
+                          elements_per_thread % (PackedSize * 8 / sizeof(DataType)) == 0)
        {
-            return (4 / sizeof(DataType));
+            return (PackedSize * 8 / sizeof(DataType));
        }
-        else if constexpr(sizeof(DataType) >= 2 && XPerTile % (2 / sizeof(DataType)) == 0 &&
-                          elements_per_thread % (2 / sizeof(DataType)) == 0)
+        else if constexpr(sizeof(DataType) >= PackedSize * 4 &&
+                          XPerTile % (PackedSize * 4 / sizeof(DataType)) == 0 &&
+                          elements_per_thread % (PackedSize * 4 / sizeof(DataType)) == 0)
        {
-            return (2 / sizeof(DataType));
+            return (PackedSize * 4 / sizeof(DataType));
+        }
+        else if constexpr(sizeof(DataType) >= PackedSize * 2 &&
+                          XPerTile % (PackedSize * 2 / sizeof(DataType)) == 0 &&
+                          elements_per_thread % (PackedSize * 2 / sizeof(DataType)) == 0)
+        {
+            return (PackedSize * 2 / sizeof(DataType));
        }
        else
        {
-            return 1;
+            return PackedSize;
        }
    }

@@ -564,8 +574,8 @@ struct UniversalGemmPipelineAgBgCrPolicy
    {
        using BlockWarps      = typename Problem::BlockGemmShape::BlockWarps;
        using WarpTile        = typename Problem::BlockGemmShape::WarpTile;
-        using WarpGemm        = WarpGemmMfmaDispatcher<typename Problem::ADataType,
-                                                typename Problem::BDataType,
+        using WarpGemm        = WarpGemmMfmaDispatcher<typename Problem::ComputeDataType,
+                                                typename Problem::ComputeDataType,
                                                typename Problem::CDataType,
                                                WarpTile::at(I0),
                                                WarpTile::at(I1),