diff --git a/.github/workflows/therock-ci-linux.yml b/.github/workflows/therock-ci-linux.yml
index f4d0c0063c..86d134e456 100644
--- a/.github/workflows/therock-ci-linux.yml
+++ b/.github/workflows/therock-ci-linux.yml
@@ -53,8 +53,8 @@ jobs:
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           repository: "ROCm/TheRock"
-          ref: c2921b151b8285a1d29942aceb33cfe0fea77ac9 # 10-15-2025 commit
           path: "TheRock"
+          ref: f3f77a3161922df3eee006b888b439d75b2b4668 # 2025-10-29 commit
 
       - name: Setup ccache
         run: |
@@ -77,6 +77,8 @@ jobs:
       - name: Patch rocm-libraries
         run: |
           git config --global --add safe.directory '*'
+          # Delete any patch here that no longer applies cleanly but has not yet been removed from the TheRock repo
+          rm -f ./TheRock/patches/amd-mainline/rocm-libraries/0008-Revert-remove-options-no-enumerate-966.patch
           git -c user.name="therockbot" -c "user.email=therockbot@amd.com" am --whitespace=nowarn ./TheRock/patches/amd-mainline/rocm-libraries/*.patch
 
       - name: Install python deps
@@ -128,7 +130,7 @@ jobs:
         run: |
           python3 TheRock/build_tools/github_actions/post_build_upload.py \
             --run-id ${{ github.run_id }} \
-            --amdgpu-family ${{ env.AMDGPU_FAMILIES }} \
+            --artifact-group ${{ env.AMDGPU_FAMILIES }} \
             --build-dir TheRock/build \
             --upload
 
diff --git a/.github/workflows/therock-test-component.yml b/.github/workflows/therock-test-component.yml
index 1ccc1d57bc..27eff4fdb0 100644
--- a/.github/workflows/therock-test-component.yml
+++ b/.github/workflows/therock-test-component.yml
@@ -51,13 +51,13 @@ jobs:
         uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0
         with:
           repository: "ROCm/TheRock"
-          ref: c2921b151b8285a1d29942aceb33cfe0fea77ac9 # 10-15-2025 commit
+          ref: f3f77a3161922df3eee006b888b439d75b2b4668 # 2025-10-29 commit
 
       - name: Run setup test environment workflow
         uses: './.github/actions/setup_test_environment'
         with:
           ARTIFACT_RUN_ID: ${{ env.ARTIFACT_RUN_ID }}
-          AMDGPU_FAMILIES: ${{ inputs.amdgpu_families }}
+          ARTIFACT_GROUP: ${{ inputs.amdgpu_families }}
           OUTPUT_ARTIFACTS_DIR: ${{ env.OUTPUT_ARTIFACTS_DIR }}
           VENV_DIR: ${{ env.VENV_DIR }}
           FETCH_ARTIFACT_ARGS: ${{ fromJSON(inputs.component).fetch_artifact_args }}
diff --git a/.github/workflows/therock-test-packages.yml b/.github/workflows/therock-test-packages.yml
index efb5a6b1a0..81632fce48 100644
--- a/.github/workflows/therock-test-packages.yml
+++ b/.github/workflows/therock-test-packages.yml
@@ -27,7 +27,7 @@ jobs:
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
         with:
           repository: "ROCm/TheRock"
-          ref: c2921b151b8285a1d29942aceb33cfe0fea77ac9 # 10-15-2025 commit
+          ref: f3f77a3161922df3eee006b888b439d75b2b4668 # 2025-10-29 commit
 
       - name: "Configuring CI options"
         env:
diff --git a/.gitignore b/.gitignore
index 6641e5bc58..2641a661d8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -66,6 +66,12 @@ docs/doxygen/xml
 cmake-build*/
 build*/
 
+# LSP configuration
+.clangd
+
+# User-defined CMake presets
+CMakeUserPresets.json
+
 # Python virtualenv
 .venv/
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 049da5637f..7b4990dba4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -683,6 +683,12 @@ if(NOT GPU_ARCHS AND USER_GPU_TARGETS AND NOT MIOPEN_REQ_LIBS_ONLY)
         PACKAGE_NAME examples
    )
    add_subdirectory(example)
+
+   add_subdirectory(tutorial)
+   rocm_package_setup_component(tutorials
+        LIBRARY_NAME composablekernel
+        PACKAGE_NAME tutorials
+   )
    add_subdirectory(tile_engine)
    if(BUILD_TESTING)
        add_subdirectory(test)
diff --git a/Jenkinsfile b/Jenkinsfile
index aa4045186e..80db99684a 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -1836,10 +1836,11 @@ pipeline {
                     }
                     agent{ label rocmnode("gfx90a") }
                     environment{
-                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx90a" -DCMAKE_CXX_FLAGS=" -O3 " """
+                        setup_args = """ -DCMAKE_INSTALL_PREFIX=../install -DGPU_TARGETS="gfx90a" -DCK_CXX_STANDARD="17" -DCMAKE_CXX_FLAGS=" -O3 " """
                         execute_args = """ cd ../client_example && rm -rf build && mkdir build && cd build && \
                                            cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" \
                                            -DGPU_TARGETS="gfx90a" \
+                                           -DCK_CXX_STANDARD="17" \
                                            -DCMAKE_CXX_COMPILER="${build_compiler()}" \
                                            -DCMAKE_C_COMPILER=/opt/rocm/llvm/bin/clang \
                                            -DCMAKE_CXX_FLAGS=" -O3 " .. && make -j """
diff --git a/example/01_gemm/gemm_wmma_fp8_v3.cpp b/example/01_gemm/gemm_wmma_fp8_v3.cpp
index 0376820b7b..2f8eac113b 100644
--- a/example/01_gemm/gemm_wmma_fp8_v3.cpp
+++ b/example/01_gemm/gemm_wmma_fp8_v3.cpp
@@ -13,7 +13,7 @@ using CDataType        = ck::bhalf_t;
 using ComputeTypeA     = ck::f8_t;
 using ComputeTypeB     = ck::f8_t;
 
-using ALayout = Row;
+using ALayout = Col;
 using BLayout = Col;
 using CLayout = Row;
 
@@ -30,13 +30,13 @@ using DeviceGemmV2Instance = ck::tensor_operation::device::DeviceGemm_Wmma_CShuf
     PassThrough, PassThrough, PassThrough, GemmDefault,
     128,
     128, 64, 64,
-    8, 8,
+    16, 16, // AK1, BK1
     16, 16,
     4, 2,
+    S<4, 32, 1>, S<0, 2, 1>, S<0, 2, 1>,
+    1, 4, 16, 0,
     S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,
-    2, 8, 8, 0,
-    S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>,
-    2, 8, 8, 0,
+    2, 16, 16, 0,
     1, 1, S<1, 32, 1, 4>, 8,
     ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1,
     ComputeTypeA, ComputeTypeB>;
diff --git a/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp b/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp
index 63343df3a8..6f30bdaa73 100644
--- a/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp
+++ b/example/59_grouped_gemm_multi_ABD/grouped_gemm_multi_abd_xdl_fixed_nk_bias_bf16_i8.cpp
@@ -221,8 +221,8 @@ bool run_grouped_gemm(const ProblemSize& problem_size, const ExecutionConfig& co
         b0_tensors_device.emplace_back(std::make_unique<DeviceMem>(
             sizeof(B0DataType) * problem_size.Ns[i] * problem_size.Ks[i]));
 
-        b1_tensors_device.emplace_back(
-            std::make_unique<DeviceMem>(sizeof(B1DataType) * problem_size.Ns[i]));
+        b1_tensors_device.emplace_back(std::make_unique<DeviceMem>(
+            sizeof(B1DataType) * problem_size.Ns[i] * problem_size.Ks[i]));
 
         d0_tensors_device.emplace_back(
             std::make_unique<DeviceMem>(sizeof(D0DataType) * problem_size.Ns[i]));
diff --git a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp
index 1adf039b70..ebb73ca7e0 100644
--- a/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp
+++ b/example/67_gemm_microscaling/moe_gemm1_xdl_mx_fp4_bpreshuffle.cpp
@@ -181,7 +181,7 @@ constexpr ck::index_t ScaleBlockSize   = 32;                   // scaling block
 constexpr ck::index_t KPerBlock        = 256 / DataPackedSize; // 256 f4 = 128 fp4x2
 static constexpr ck::index_t Nswizzle  = false;
 static constexpr ck::index_t ActOP     = 0; // 0: gelu_and_mul, 1: silu_and_mul
-static constexpr ck::index_t MPerBlock = 128;
+static constexpr ck::index_t MPerBlock = 32;
 static constexpr bool MulRoutedWeight  = true;
 
 // clang-format off
@@ -190,10 +190,10 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceMoeGemmMXBPreShuffl
     A0DataType,  A1DataType,  B0DataType,  B1DataType,  DsDataType, EDataType, AccDataType, CShuffleDataType,
     AElementOp,  BElementOp, CDEElementOp, GemmSpec,   
     ScaleBlockSize,  256, 
-    MPerBlock,  64,  KPerBlock,
+    MPerBlock,  128,  KPerBlock,
     16,   16,
     16,   16,
-    4,    2,
+    2,    2,
     S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1,
     S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 1,
     2,    2,   S<1, 32, 1, 8>, S<8, 1, 1, 1>,
@@ -213,10 +213,10 @@ int main(int argc, char* argv[])
     ck::index_t sorted_size               = sorted_tile_num * MPerBlock;
     ck::index_t valid_size                = valid_tile_num * MPerBlock;
 
-    ck::index_t N       = 6144;
-    ck::index_t K       = 4096;
+    ck::index_t N       = 7168;
+    ck::index_t K       = 256;
     ck::index_t experts = 8;
-    ck::index_t tokens  = 832;
+    ck::index_t tokens  = 208;
     ck::index_t topk    = 2;
 
     if(argc == 1)
diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc
index d5f164c40f..703ab810d8 100644
--- a/example/ck_tile/03_gemm/run_gemm_example.inc
+++ b/example/ck_tile/03_gemm/run_gemm_example.inc
@@ -309,8 +309,8 @@ int run_gemm_example_with_layouts(ck_tile::ArgParser& arg_parser,
 
     if(init_method == 0)
     {
-        ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_m_k);
-        ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_k_n);
+        ck_tile::FillUniformDistribution<ADataType>{-2.f, 2.f}(a_m_k);
+        ck_tile::FillUniformDistribution<BDataType>{-2.f, 2.f}(b_k_n);
     }
     else if(init_method == 1)
     {
diff --git a/example/ck_tile/20_grouped_convolution/conv_configs.hpp b/example/ck_tile/20_grouped_convolution/conv_configs.hpp
index c688215280..8a2a60a197 100644
--- a/example/ck_tile/20_grouped_convolution/conv_configs.hpp
+++ b/example/ck_tile/20_grouped_convolution/conv_configs.hpp
@@ -14,28 +14,14 @@
 
 struct ConvConfigBase
 {
-    static constexpr bool kPadM = true;
-    static constexpr bool kPadN = true;
-    static constexpr bool kPadK = true;
-
-    static constexpr bool PermuteA = false;
-    static constexpr bool PermuteB = false;
-
-    static constexpr bool TransposeC            = false;
-    static constexpr bool UseStructuredSparsity = false;
-
     static constexpr ck_tile::index_t VectorSizeA = 4;
     static constexpr ck_tile::index_t VectorSizeB = 8;
     static constexpr ck_tile::index_t VectorSizeC = 8;
 
-    static constexpr int kBlockPerCu                         = 1;
-    static constexpr ck_tile::index_t TileParitionerGroupNum = 8;
-    static constexpr ck_tile::index_t TileParitionerM01      = 4;
+    static constexpr int kBlockPerCu                = 1;
     static constexpr auto Scheduler                 = ck_tile::GemmPipelineScheduler::Intrawave;
     static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_V3;
     static constexpr ck_tile::index_t NumWaveGroups = 1;
-    static constexpr bool Preshuffle                = false;
-    static constexpr bool TiledMMAPermuteN          = false;
 
     static constexpr ck_tile::index_t NumGroupsToMerge = 1;
 };
@@ -216,9 +202,9 @@ struct ConvConfigComputeV5 : public ConvConfigBase
     static constexpr ck_tile::index_t N_Warp_Tile = 32;
     static constexpr ck_tile::index_t K_Warp_Tile = 16;
 
-    static constexpr bool DoubleSmemBuffer               = false;
-    static constexpr ck_tile::GemmPipeline Pipeline      = ck_tile::GemmPipeline::COMPUTE_V5;
-    static constexpr ck_tile::index_t NumWaNumWaveGroups = 2;
+    static constexpr bool DoubleSmemBuffer          = false;
+    static constexpr ck_tile::GemmPipeline Pipeline = ck_tile::GemmPipeline::COMPUTE_V5;
+    static constexpr ck_tile::index_t NumWaveGroups = 2;
 };
 
 template <typename PrecType>
diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data.cpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data.cpp
index ad593b1418..8ea892a215 100644
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data.cpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data.cpp
@@ -14,7 +14,7 @@
 #include "grouped_convolution_backward_data_invoker.hpp"
 #include "run_grouped_convolution_bwd_data_example.inc"
 
-template <template <typename PrecType> typename GemmConfig>
+template <template <typename PrecType> typename ConvConfig>
 int run_grouped_conv_bwd_data_example(int argc, char* argv[])
 {
     using Invoker = GroupedConvolutionBackwardDataInvoker;
@@ -31,14 +31,14 @@ int run_grouped_conv_bwd_data_example(int argc, char* argv[])
     if(data_type == "fp16")
     {
         return run_grouped_conv_bwd_data_example_prec_type<Invoker,
-                                                           GemmConfig<ck_tile::half_t>,
+                                                           ConvConfig<ck_tile::half_t>,
                                                            ck_tile::half_t>(
             in_layout, wei_layout, out_layout, argc, argv);
     }
     else if(data_type == "bf16")
     {
         return run_grouped_conv_bwd_data_example_prec_type<Invoker,
-                                                           GemmConfig<ck_tile::bf16_t>,
+                                                           ConvConfig<ck_tile::bf16_t>,
                                                            ck_tile::bf16_t>(
             in_layout, wei_layout, out_layout, argc, argv);
     }
diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data_invoker.hpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data_invoker.hpp
index f6d20c3d3a..d19d3ac8ec 100644
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data_invoker.hpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_data_invoker.hpp
@@ -8,7 +8,7 @@ struct GroupedConvolutionBackwardDataInvoker
 {
 
     template <ck_tile::index_t NDimSpatial,
-              typename GemmConfig,
+              typename ConvConfig,
               typename InDataType,
               typename WeiDataType,
               typename AccDataType,
@@ -22,64 +22,59 @@ struct GroupedConvolutionBackwardDataInvoker
     static float grouped_conv_bwd_data(const ck_tile::GroupedConvBwdDataHostArgs& args,
                                        const ck_tile::stream_config& s)
     {
-        constexpr int kBlockPerCu = 1;
-
         // Implicit GEMM Traits
         using GemmShape = ck_tile::TileGemmShape<
-            ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
-            ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
-            ck_tile::
-                sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>,
-            GemmConfig::PermuteA,
-            GemmConfig::PermuteB>;
+            ck_tile::sequence<ConvConfig::M_Tile, ConvConfig::N_Tile, ConvConfig::K_Tile>,
+            ck_tile::sequence<ConvConfig::M_Warp, ConvConfig::N_Warp, ConvConfig::K_Warp>,
+            ck_tile::sequence<ConvConfig::M_Warp_Tile,
+                              ConvConfig::N_Warp_Tile,
+                              ConvConfig::K_Warp_Tile>>;
 
-        constexpr ck_tile::index_t VectorSizeA = 8;
-        constexpr ck_tile::index_t VectorSizeB = 8;
-        constexpr ck_tile::index_t VectorSizeC = 8;
-
-        constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default;
-        using TilePartitioner =
-            ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
-                                                       GemmConfig::TileParitionerGroupNum,
-                                                       GemmConfig::TileParitionerM01>;
+        constexpr auto ConvSpec     = ck_tile::ConvolutionSpecialization::Default;
         using GroupedConvTraitsType = ck_tile::GroupedConvTraits<NDimSpatial,
                                                                  ConvSpec,
                                                                  InLayout,
                                                                  WeiLayout,
                                                                  DsLayout,
                                                                  OutLayout,
-                                                                 VectorSizeA,
-                                                                 VectorSizeB,
-                                                                 VectorSizeC>;
+                                                                 ConvConfig::VectorSizeA,
+                                                                 ConvConfig::VectorSizeB,
+                                                                 ConvConfig::VectorSizeC>;
+
+        using TilePartitioner = ck_tile::GemmSpatiallyLocalTilePartitioner<
+            GemmShape,
+            GroupedConvTraitsType::FixedGemmParams::TilePartitionerGroupNum,
+            GroupedConvTraitsType::FixedGemmParams::TilePartitionerM01>;
 
         using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<
-            GemmConfig::kPadM,
-            GemmConfig::kPadN,
-            GemmConfig::kPadK,
-            GemmConfig::DoubleSmemBuffer,
-            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdData::AsLayout,
-            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdData::BsLayout,
-            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdData::CLayout,
-            GemmConfig::TransposeC,
-            GemmConfig::UseStructuredSparsity,
-            false, // Persistent,
-            GemmConfig::NumWaveGroups>;
+            GroupedConvTraitsType::FixedGemmParams::kPadM,
+            GroupedConvTraitsType::FixedGemmParams::kPadN,
+            GroupedConvTraitsType::FixedGemmParams::kPadK,
+            ConvConfig::DoubleSmemBuffer,
+            typename GroupedConvTraitsType::AsLayoutBwdData,
+            typename GroupedConvTraitsType::BsLayoutBwdData,
+            typename GroupedConvTraitsType::CLayoutBwdData,
+            GroupedConvTraitsType::FixedGemmParams::TransposeC,
+            GroupedConvTraitsType::FixedGemmParams::UseStructuredSparsity,
+            GroupedConvTraitsType::FixedGemmParams::Persistent,
+            ConvConfig::NumWaveGroups>;
 
         using GemmPipelineProblem = ck_tile::GemmPipelineProblem<
             OutDataType,
             WeiDataType,
             AccDataType,
             GemmShape,
-            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdData,
+            typename GroupedConvTraitsType::template GroupedConvImplicitGemmTraitsBwdData<
+                ConvConfig::NumWaveGroups>,
             ck_tile::element_wise::PassThrough,
             ck_tile::element_wise::PassThrough,
             InDataType,
-            true,
-            VectorSizeA,
-            VectorSizeB>;
+            GroupedConvTraitsType::FixedGemmParams::FixedVectorSize,
+            GroupedConvTraitsType::VectorSizeA,
+            GroupedConvTraitsType::VectorSizeB>;
 
         using BaseGemmPipeline = typename PipelineTypeTraits<
-            GemmConfig::Pipeline>::template UniversalGemmPipeline<GemmPipelineProblem>;
+            ConvConfig::Pipeline>::template UniversalGemmPipeline<GemmPipelineProblem>;
 
         const ck_tile::index_t gemm_k =
             args.K_ * std::accumulate(args.filter_spatial_lengths_.begin(),
@@ -87,102 +82,103 @@ struct GroupedConvolutionBackwardDataInvoker
                                       1,
                                       std::multiplies<ck_tile::index_t>());
 
-        const ck_tile::index_t k_grain     = args.k_batch * GemmConfig::K_Tile;
-        const ck_tile::index_t K_split     = (gemm_k + k_grain - 1) / k_grain * GemmConfig::K_Tile;
+        const ck_tile::index_t k_grain     = args.k_batch * ConvConfig::K_Tile;
+        const ck_tile::index_t K_split     = (gemm_k + k_grain - 1) / k_grain * ConvConfig::K_Tile;
         const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
         const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
         const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
         float ave_time{0};
 
-        const auto Run =
-            [&](const auto has_hot_loop_, const auto tail_number_, const auto memory_operation_) {
-                constexpr bool has_hot_loop_v   = has_hot_loop_.value;
-                constexpr auto tail_number_v    = tail_number_.value;
-                constexpr auto scheduler        = GemmConfig::Scheduler;
-                constexpr auto memory_operation = memory_operation_.value;
+        const auto Run = [&](const auto has_hot_loop_,
+                             const auto tail_number_,
+                             const auto memory_operation_) {
+            constexpr bool has_hot_loop_v   = has_hot_loop_.value;
+            constexpr auto tail_number_v    = tail_number_.value;
+            constexpr auto scheduler        = ConvConfig::Scheduler;
+            constexpr auto memory_operation = memory_operation_.value;
 
-                using UniversalGemmProblem =
-                    ck_tile::UniversalGemmPipelineProblem<OutDataType,
-                                                          WeiDataType,
-                                                          AccDataType,
-                                                          GemmShape,
-                                                          GemmUniversalTraits,
-                                                          scheduler,
-                                                          has_hot_loop_v,
-                                                          tail_number_v,
-                                                          ck_tile::element_wise::PassThrough,
-                                                          ck_tile::element_wise::PassThrough,
-                                                          InDataType,
-                                                          true,
-                                                          VectorSizeA,
-                                                          VectorSizeB>;
+            using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<
+                OutDataType,
+                WeiDataType,
+                AccDataType,
+                GemmShape,
+                GemmUniversalTraits,
+                scheduler,
+                has_hot_loop_v,
+                tail_number_v,
+                ck_tile::element_wise::PassThrough,
+                ck_tile::element_wise::PassThrough,
+                InDataType,
+                GroupedConvTraitsType::FixedGemmParams::FixedVectorSize,
+                GroupedConvTraitsType::VectorSizeA,
+                GroupedConvTraitsType::VectorSizeB>;
 
-                using GemmPipeline = typename PipelineTypeTraits<
-                    GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
+            using GemmPipeline = typename PipelineTypeTraits<
+                ConvConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
 
-                using ConvEpilogue = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
-                    OutDataType,
-                    WeiDataType,
-                    DsDataType,
-                    AccDataType,
-                    InDataType,
-                    typename GroupedConvTraitsType::ImplicitGemmDsLayout,
-                    ck_tile::tensor_layout::gemm::RowMajor,
-                    CDEElementWise,
-                    TilePartitioner::MPerBlock,
-                    TilePartitioner::NPerBlock,
-                    GemmConfig::M_Warp,
-                    GemmConfig::N_Warp,
-                    GemmConfig::M_Warp_Tile,
-                    GemmConfig::N_Warp_Tile,
-                    GemmConfig::K_Warp_Tile,
-                    GemmConfig::TransposeC,
-                    memory_operation,
-                    1,
-                    true,
-                    GroupedConvTraitsType::VectorSizeC>>;
+            using ConvEpilogue = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
+                OutDataType,
+                WeiDataType,
+                DsDataType,
+                AccDataType,
+                InDataType,
+                typename GroupedConvTraitsType::ImplicitGemmDsLayout,
+                typename GroupedConvTraitsType::FixedGemmParams::ELayout,
+                CDEElementWise,
+                TilePartitioner::MPerBlock,
+                TilePartitioner::NPerBlock,
+                ConvConfig::M_Warp,
+                ConvConfig::N_Warp,
+                ConvConfig::M_Warp_Tile,
+                ConvConfig::N_Warp_Tile,
+                ConvConfig::K_Warp_Tile,
+                GroupedConvTraitsType::FixedGemmParams::TransposeC,
+                memory_operation,
+                ConvConfig::NumWaveGroups,
+                GroupedConvTraitsType::FixedGemmParams::FixedVectorSize,
+                GroupedConvTraitsType::VectorSizeC>>;
 
-                using Kernel = ck_tile::GroupedConvolutionBackwardDataKernel<GroupedConvTraitsType,
-                                                                             TilePartitioner,
-                                                                             GemmPipeline,
-                                                                             ConvEpilogue>;
-                auto kargs   = Kernel::MakeKernelArgs(args);
+            using Kernel = ck_tile::GroupedConvolutionBackwardDataKernel<GroupedConvTraitsType,
+                                                                         TilePartitioner,
+                                                                         GemmPipeline,
+                                                                         ConvEpilogue>;
+            auto kargs   = Kernel::MakeKernelArgs(args);
 
-                const dim3 grids  = Kernel::GridSize(args);
-                const dim3 blocks = Kernel::BlockSize();
+            const dim3 grids  = Kernel::GridSize(args);
+            const dim3 blocks = Kernel::BlockSize();
 
-                if(!Kernel::IsSupportedArgument(kargs))
-                {
-                    throw std::runtime_error("Wrong! Arguments not supported! Skipping conv!\n");
-                }
+            if(!Kernel::IsSupportedArgument(kargs))
+            {
+                throw std::runtime_error("Wrong! Arguments not supported! Skipping conv!\n");
+            }
 
-                if(s.log_level_ > 0)
-                {
-                    std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
-                              << "shape: " << GemmShape::GetName() << '\n'
-                              << "problem: " << UniversalGemmProblem::GetName() << '\n'
-                              << "pipeline: " << GemmPipeline::GetName() << '\n'
-                              << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                              << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
-                              << "}" << '\n'
-                              << "Vector size A: " << GemmPipeline::GetVectorSizeA()
-                              << ", Vector size B: " << GemmPipeline::GetVectorSizeB()
-                              << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
-                }
+            if(s.log_level_ > 0)
+            {
+                std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                          << "shape: " << GemmShape::GetName() << '\n'
+                          << "problem: " << UniversalGemmProblem::GetName() << '\n'
+                          << "pipeline: " << GemmPipeline::GetName() << '\n'
+                          << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                          << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
+                          << "}" << '\n'
+                          << "Vector size A: " << GemmPipeline::GetVectorSizeA()
+                          << ", Vector size B: " << GemmPipeline::GetVectorSizeB()
+                          << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
+            }
 
-                auto preprocess = [&]() {
-                    ck_tile::hip_check_error(hipMemsetAsync(
-                        kargs.in_ptr, 0, args.template GetInputByte<InDataType>(), s.stream_id_));
-                };
-
-                ave_time = ck_tile::launch_kernel_time_mask(
-                    s,
-                    preprocess,
-                    ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
-
-                return ave_time;
+            auto preprocess = [&]() {
+                ck_tile::hip_check_error(hipMemsetAsync(
+                    kargs.in_ptr, 0, args.template GetInputByte<InDataType>(), s.stream_id_));
             };
 
+            ave_time = ck_tile::launch_kernel_time_mask(
+                s,
+                preprocess,
+                ck_tile::make_kernel<ConvConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+
+            return ave_time;
+        };
+
         const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
             if(args.k_batch == 1)
             {
diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_invoker.hpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_invoker.hpp
index 0c00bb78e1..81b9d402ce 100644
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_invoker.hpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_invoker.hpp
@@ -21,48 +21,42 @@ struct GroupedConvolutionBackwardWeightInvoker
     static float grouped_conv_bwd_weight(const ck_tile::GroupedConvBwdWeightHostArgs& args,
                                          const ck_tile::stream_config& s)
     {
-        constexpr int kBlockPerCu = 1;
-
         // Implicit GEMM Traits
         using GemmShape = ck_tile::TileGemmShape<
             ck_tile::sequence<ConvConfig::M_Tile, ConvConfig::N_Tile, ConvConfig::K_Tile>,
             ck_tile::sequence<ConvConfig::M_Warp, ConvConfig::N_Warp, ConvConfig::K_Warp>,
-            ck_tile::
-                sequence<ConvConfig::M_Warp_Tile, ConvConfig::N_Warp_Tile, ConvConfig::K_Warp_Tile>,
-            ConvConfig::PermuteA,
-            ConvConfig::PermuteB>;
+            ck_tile::sequence<ConvConfig::M_Warp_Tile,
+                              ConvConfig::N_Warp_Tile,
+                              ConvConfig::K_Warp_Tile>>;
 
-        constexpr ck_tile::index_t VectorSizeA = ConvConfig::VectorSizeA;
-        constexpr ck_tile::index_t VectorSizeB = ConvConfig::VectorSizeB;
-        constexpr ck_tile::index_t VectorSizeC = ConvConfig::VectorSizeC;
-
-        constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default;
-        using TilePartitioner =
-            ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
-                                                       ConvConfig::TileParitionerGroupNum,
-                                                       ConvConfig::TileParitionerM01>;
+        constexpr auto ConvSpec     = ck_tile::ConvolutionSpecialization::Default;
         using GroupedConvTraitsType = ck_tile::GroupedConvTraits<NDimSpatial,
                                                                  ConvSpec,
                                                                  InLayout,
                                                                  WeiLayout,
                                                                  DsLayout,
                                                                  OutLayout,
-                                                                 VectorSizeA,
-                                                                 VectorSizeB,
-                                                                 VectorSizeC,
+                                                                 ConvConfig::VectorSizeA,
+                                                                 ConvConfig::VectorSizeB,
+                                                                 ConvConfig::VectorSizeC,
                                                                  ConvConfig::NumGroupsToMerge>;
 
+        using TilePartitioner = ck_tile::GemmSpatiallyLocalTilePartitioner<
+            GemmShape,
+            GroupedConvTraitsType::FixedGemmParams::TilePartitionerGroupNum,
+            GroupedConvTraitsType::FixedGemmParams::TilePartitionerM01>;
+
         using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<
-            ConvConfig::kPadM,
-            ConvConfig::kPadN,
-            ConvConfig::kPadK,
+            GroupedConvTraitsType::FixedGemmParams::kPadM,
+            GroupedConvTraitsType::FixedGemmParams::kPadN,
+            GroupedConvTraitsType::FixedGemmParams::kPadK,
             ConvConfig::DoubleSmemBuffer,
-            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdWeight::AsLayout,
-            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdWeight::BsLayout,
-            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdWeight::CLayout,
-            ConvConfig::TransposeC,
-            ConvConfig::UseStructuredSparsity,
-            false, // Persistent,
+            typename GroupedConvTraitsType::AsLayoutBwdWeight,
+            typename GroupedConvTraitsType::BsLayoutBwdWeight,
+            typename GroupedConvTraitsType::CLayoutBwdWeight,
+            GroupedConvTraitsType::FixedGemmParams::TransposeC,
+            GroupedConvTraitsType::FixedGemmParams::UseStructuredSparsity,
+            GroupedConvTraitsType::FixedGemmParams::Persistent,
             ConvConfig::NumWaveGroups>;
 
         using GemmPipelineProblem = ck_tile::GemmPipelineProblem<
@@ -70,13 +64,14 @@ struct GroupedConvolutionBackwardWeightInvoker
             InDataType,
             AccDataType,
             GemmShape,
-            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdWeight,
+            typename GroupedConvTraitsType::template GroupedConvImplicitGemmTraitsBwdWeight<
+                ConvConfig::NumWaveGroups>,
             ck_tile::element_wise::PassThrough,
             ck_tile::element_wise::PassThrough,
             WeiDataType,
-            true,
-            VectorSizeA,
-            VectorSizeB>;
+            GroupedConvTraitsType::FixedGemmParams::FixedVectorSize,
+            GroupedConvTraitsType::VectorSizeA,
+            GroupedConvTraitsType::VectorSizeB>;
 
         using BaseGemmPipeline = typename PipelineTypeTraits<
             ConvConfig::Pipeline>::template UniversalGemmPipeline<GemmPipelineProblem>;
@@ -102,21 +97,21 @@ struct GroupedConvolutionBackwardWeightInvoker
             constexpr auto scheduler        = ConvConfig::Scheduler;
             constexpr auto memory_operation = memory_operation_.value;
 
-            using UniversalGemmProblem =
-                ck_tile::UniversalGemmPipelineProblem<OutDataType,
-                                                      InDataType,
-                                                      AccDataType,
-                                                      GemmShape,
-                                                      GemmUniversalTraits,
-                                                      scheduler,
-                                                      has_hot_loop_v,
-                                                      tail_number_v,
-                                                      ck_tile::element_wise::PassThrough,
-                                                      ck_tile::element_wise::PassThrough,
-                                                      WeiDataType,
-                                                      true,
-                                                      VectorSizeA,
-                                                      VectorSizeB>;
+            using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<
+                OutDataType,
+                InDataType,
+                AccDataType,
+                GemmShape,
+                GemmUniversalTraits,
+                scheduler,
+                has_hot_loop_v,
+                tail_number_v,
+                ck_tile::element_wise::PassThrough,
+                ck_tile::element_wise::PassThrough,
+                WeiDataType,
+                GroupedConvTraitsType::FixedGemmParams::FixedVectorSize,
+                GroupedConvTraitsType::VectorSizeA,
+                GroupedConvTraitsType::VectorSizeB>;
 
             using GemmPipeline = typename PipelineTypeTraits<
                 ConvConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
@@ -128,7 +123,7 @@ struct GroupedConvolutionBackwardWeightInvoker
                 AccDataType,
                 WeiDataType,
                 typename GroupedConvTraitsType::ImplicitGemmDsLayout,
-                ck_tile::tensor_layout::gemm::RowMajor,
+                typename GroupedConvTraitsType::FixedGemmParams::ELayout,
                 CDEElementWise,
                 TilePartitioner::MPerBlock,
                 TilePartitioner::NPerBlock,
@@ -137,10 +132,10 @@ struct GroupedConvolutionBackwardWeightInvoker
                 ConvConfig::M_Warp_Tile,
                 ConvConfig::N_Warp_Tile,
                 ConvConfig::K_Warp_Tile,
-                ConvConfig::TransposeC,
+                GroupedConvTraitsType::FixedGemmParams::TransposeC,
                 memory_operation,
-                1,
-                true,
+                ConvConfig::NumWaveGroups,
+                GroupedConvTraitsType::FixedGemmParams::FixedVectorSize,
                 GroupedConvTraitsType::VectorSizeC>>;
 
             using Kernel = ck_tile::GroupedConvolutionBackwardWeightKernel<GroupedConvTraitsType,
@@ -185,7 +180,7 @@ struct GroupedConvolutionBackwardWeightInvoker
             ave_time = ck_tile::launch_kernel_time_mask(
                 s,
                 preprocess,
-                ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+                ck_tile::make_kernel<ConvConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
 
             return ave_time;
         };
diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_two_stage_invoker.hpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_two_stage_invoker.hpp
index 047d143553..8cef2bde65 100644
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_two_stage_invoker.hpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_two_stage_invoker.hpp
@@ -23,47 +23,42 @@ struct GroupedConvolutionBackwardWeightTwoStageInvoker
     {
         using WorkspaceDataType = float;
 
-        constexpr int kBlockPerCu = 1;
-
         // Implicit GEMM Traits
         using GemmShape = ck_tile::TileGemmShape<
             ck_tile::sequence<ConvConfig::M_Tile, ConvConfig::N_Tile, ConvConfig::K_Tile>,
             ck_tile::sequence<ConvConfig::M_Warp, ConvConfig::N_Warp, ConvConfig::K_Warp>,
-            ck_tile::
-                sequence<ConvConfig::M_Warp_Tile, ConvConfig::N_Warp_Tile, ConvConfig::K_Warp_Tile>,
-            ConvConfig::PermuteA,
-            ConvConfig::PermuteB>;
+            ck_tile::sequence<ConvConfig::M_Warp_Tile,
+                              ConvConfig::N_Warp_Tile,
+                              ConvConfig::K_Warp_Tile>>;
 
-        constexpr ck_tile::index_t VectorSizeA = 4;
-        constexpr ck_tile::index_t VectorSizeB = 8;
-        constexpr ck_tile::index_t VectorSizeC = 8;
-
-        constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default;
-        using TilePartitioner =
-            ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
-                                                       ConvConfig::TileParitionerGroupNum,
-                                                       ConvConfig::TileParitionerM01>;
+        constexpr auto ConvSpec     = ck_tile::ConvolutionSpecialization::Default;
         using GroupedConvTraitsType = ck_tile::GroupedConvTraits<NDimSpatial,
                                                                  ConvSpec,
                                                                  InLayout,
                                                                  WeiLayout,
                                                                  DsLayout,
                                                                  OutLayout,
-                                                                 VectorSizeA,
-                                                                 VectorSizeB,
-                                                                 VectorSizeC>;
+                                                                 ConvConfig::VectorSizeA,
+                                                                 ConvConfig::VectorSizeB,
+                                                                 ConvConfig::VectorSizeC,
+                                                                 ConvConfig::NumGroupsToMerge>;
+
+        using TilePartitioner = ck_tile::GemmSpatiallyLocalTilePartitioner<
+            GemmShape,
+            GroupedConvTraitsType::FixedGemmParams::TilePartitionerGroupNum,
+            GroupedConvTraitsType::FixedGemmParams::TilePartitionerM01>;
 
         using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<
-            ConvConfig::kPadM,
-            ConvConfig::kPadN,
-            ConvConfig::kPadK,
+            GroupedConvTraitsType::FixedGemmParams::kPadM,
+            GroupedConvTraitsType::FixedGemmParams::kPadN,
+            GroupedConvTraitsType::FixedGemmParams::kPadK,
             ConvConfig::DoubleSmemBuffer,
-            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdWeight::AsLayout,
-            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdWeight::BsLayout,
-            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdWeight::CLayout,
-            ConvConfig::TransposeC,
-            ConvConfig::UseStructuredSparsity,
-            false, // Persistent,
+            typename GroupedConvTraitsType::AsLayoutBwdWeight,
+            typename GroupedConvTraitsType::BsLayoutBwdWeight,
+            typename GroupedConvTraitsType::CLayoutBwdWeight,
+            GroupedConvTraitsType::FixedGemmParams::TransposeC,
+            GroupedConvTraitsType::FixedGemmParams::UseStructuredSparsity,
+            GroupedConvTraitsType::FixedGemmParams::Persistent,
             ConvConfig::NumWaveGroups>;
 
         using GemmPipelineProblem = ck_tile::GemmPipelineProblem<
@@ -71,13 +66,14 @@ struct GroupedConvolutionBackwardWeightTwoStageInvoker
             InDataType,
             AccDataType,
             GemmShape,
-            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdWeight,
+            typename GroupedConvTraitsType::template GroupedConvImplicitGemmTraitsBwdWeight<
+                ConvConfig::NumWaveGroups>,
             ck_tile::element_wise::PassThrough,
             ck_tile::element_wise::PassThrough,
             WeiDataType,
-            true,
-            VectorSizeA,
-            VectorSizeB>;
+            GroupedConvTraitsType::FixedGemmParams::FixedVectorSize,
+            GroupedConvTraitsType::VectorSizeA,
+            GroupedConvTraitsType::VectorSizeB>;
 
         using BaseGemmPipeline = typename PipelineTypeTraits<
             ConvConfig::Pipeline>::template UniversalGemmPipeline<GemmPipelineProblem>;
@@ -103,21 +99,21 @@ struct GroupedConvolutionBackwardWeightTwoStageInvoker
             constexpr auto scheduler        = ConvConfig::Scheduler;
             constexpr auto memory_operation = memory_operation_.value;
 
-            using UniversalGemmProblem =
-                ck_tile::UniversalGemmPipelineProblem<OutDataType,
-                                                      InDataType,
-                                                      AccDataType,
-                                                      GemmShape,
-                                                      GemmUniversalTraits,
-                                                      scheduler,
-                                                      has_hot_loop_v,
-                                                      tail_number_v,
-                                                      ck_tile::element_wise::PassThrough,
-                                                      ck_tile::element_wise::PassThrough,
-                                                      WeiDataType,
-                                                      true,
-                                                      VectorSizeA,
-                                                      VectorSizeB>;
+            using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<
+                OutDataType,
+                InDataType,
+                AccDataType,
+                GemmShape,
+                GemmUniversalTraits,
+                scheduler,
+                has_hot_loop_v,
+                tail_number_v,
+                ck_tile::element_wise::PassThrough,
+                ck_tile::element_wise::PassThrough,
+                WeiDataType,
+                GroupedConvTraitsType::FixedGemmParams::FixedVectorSize,
+                GroupedConvTraitsType::VectorSizeA,
+                GroupedConvTraitsType::VectorSizeB>;
 
             using GemmPipeline = typename PipelineTypeTraits<
                 ConvConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
@@ -129,7 +125,7 @@ struct GroupedConvolutionBackwardWeightTwoStageInvoker
                 AccDataType,
                 WorkspaceDataType, // C: Workspace  normally Out
                 typename GroupedConvTraitsType::ImplicitGemmDsLayout,
-                ck_tile::tensor_layout::gemm::RowMajor,
+                typename GroupedConvTraitsType::FixedGemmParams::ELayout,
                 CDEElementWise,
                 TilePartitioner::MPerBlock,
                 TilePartitioner::NPerBlock,
@@ -140,8 +136,8 @@ struct GroupedConvolutionBackwardWeightTwoStageInvoker
                 ConvConfig::K_Warp_Tile,
                 GemmPipelineProblem::TransposeC,
                 memory_operation,
-                1,
-                true,
+                ConvConfig::NumWaveGroups,
+                GroupedConvTraitsType::FixedGemmParams::FixedVectorSize,
                 GroupedConvTraitsType::VectorSizeC>>;
 
             using Kernel = ck_tile::GroupedConvolutionBackwardWeightKernel<GroupedConvTraitsType,
@@ -236,16 +232,17 @@ struct GroupedConvolutionBackwardWeightTwoStageInvoker
             ave_time = ck_tile::launch_kernel_time_mask(
                 s,
                 preprocess,
-                ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs),
-                ck_tile::make_kernel<kBlockPerCu>(ElementwiseKernel{},
-                                                  kGridSize,
-                                                  kBlockSize,
-                                                  0,
-                                                  input_size,
-                                                  ck_tile::make_tuple(shape[1], 1), // Input Stride
-                                                  ck_tile::make_tuple(shape[1], 1), // Output Stride
-                                                  input_tensors,
-                                                  static_cast<WeiDataType*>(c_ptr)));
+                ck_tile::make_kernel<ConvConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs),
+                ck_tile::make_kernel<ConvConfig::kBlockPerCu>(
+                    ElementwiseKernel{},
+                    kGridSize,
+                    kBlockSize,
+                    0,
+                    input_size,
+                    ck_tile::make_tuple(shape[1], 1), // Input Stride
+                    ck_tile::make_tuple(shape[1], 1), // Output Stride
+                    input_tensors,
+                    static_cast<WeiDataType*>(c_ptr)));
 
             return ave_time;
         };
diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp
index b979d4feb3..bef404b53a 100644
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward.cpp
@@ -14,7 +14,7 @@
 #include "grouped_convolution_forward_invoker.hpp"
 #include "run_grouped_convolution_fwd_example.inc"
 
-template <template <typename PrecType> typename GemmConfig>
+template <template <typename PrecType> typename ConvConfig>
 int run_grouped_conv_fwd_example(int argc, char* argv[])
 {
     using Invoker = GroupedConvolutionForwardInvoker;
@@ -31,14 +31,14 @@ int run_grouped_conv_fwd_example(int argc, char* argv[])
     if(data_type == "fp16")
     {
         return run_grouped_conv_fwd_example_prec_type<Invoker,
-                                                      GemmConfig<ck_tile::half_t>,
+                                                      ConvConfig<ck_tile::half_t>,
                                                       ck_tile::half_t>(
             in_layout, wei_layout, out_layout, argc, argv);
     }
     else if(data_type == "bf16")
     {
         return run_grouped_conv_fwd_example_prec_type<Invoker,
-                                                      GemmConfig<ck_tile::bf16_t>,
+                                                      ConvConfig<ck_tile::bf16_t>,
                                                       ck_tile::bf16_t>(
             in_layout, wei_layout, out_layout, argc, argv);
     }
diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_bias_clamp.cpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_bias_clamp.cpp
index bd31a99a7e..47395275b2 100644
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_bias_clamp.cpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_bias_clamp.cpp
@@ -14,7 +14,7 @@
 #include "grouped_convolution_forward_invoker.hpp"
 #include "run_grouped_convolution_fwd_bias_clamp_example.inc"
 
-template <template <typename PrecType> typename GemmConfig>
+template <template <typename PrecType> typename ConvConfig>
 int run_grouped_conv_fwd_bias_clamp_example(int argc, char* argv[])
 {
     using Invoker = GroupedConvolutionForwardInvoker;
@@ -31,14 +31,14 @@ int run_grouped_conv_fwd_bias_clamp_example(int argc, char* argv[])
     if(data_type == "fp16")
     {
         return run_grouped_conv_fwd_bias_clamp_example_prec_type<Invoker,
-                                                                 GemmConfig<ck_tile::half_t>,
+                                                                 ConvConfig<ck_tile::half_t>,
                                                                  ck_tile::half_t>(
             in_layout, wei_layout, out_layout, argc, argv);
     }
     else if(data_type == "bf16")
     {
         return run_grouped_conv_fwd_bias_clamp_example_prec_type<Invoker,
-                                                                 GemmConfig<ck_tile::bf16_t>,
+                                                                 ConvConfig<ck_tile::bf16_t>,
                                                                  ck_tile::bf16_t>(
             in_layout, wei_layout, out_layout, argc, argv);
     }
diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_invoker.hpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_invoker.hpp
index 89922fc07b..7c8269d13c 100644
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_invoker.hpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_invoker.hpp
@@ -14,7 +14,7 @@
 struct GroupedConvolutionForwardInvoker
 {
     template <ck_tile::index_t NDimSpatial,
-              typename GemmConfig,
+              typename ConvConfig,
               typename InDataType,
               typename WeiDataType,
               typename AccDataType,
@@ -32,68 +32,60 @@ struct GroupedConvolutionForwardInvoker
         {
             std::cout << "[INVOKER] grouped_conv_fwd called, NDimSpatial=" << NDimSpatial << "\n";
         }
-        constexpr int kBlockPerCu = 1;
-
         // Implicit GEMM Traits
         using GemmShape = ck_tile::TileGemmShape<
-            ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
-            ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
-            ck_tile::
-                sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>,
-            GemmConfig::PermuteA,
-            GemmConfig::PermuteB>;
+            ck_tile::sequence<ConvConfig::M_Tile, ConvConfig::N_Tile, ConvConfig::K_Tile>,
+            ck_tile::sequence<ConvConfig::M_Warp, ConvConfig::N_Warp, ConvConfig::K_Warp>,
+            ck_tile::sequence<ConvConfig::M_Warp_Tile,
+                              ConvConfig::N_Warp_Tile,
+                              ConvConfig::K_Warp_Tile>>;
 
-        constexpr ck_tile::index_t VectorSizeA      = 8;
-        constexpr ck_tile::index_t VectorSizeB      = 8;
-        constexpr ck_tile::index_t VectorSizeC      = 8;
-        constexpr ck_tile::index_t NumGroupsToMerge = 1;
-
-        constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default;
-        using TilePartitioner =
-            ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
-                                                       GemmConfig::TileParitionerGroupNum,
-                                                       GemmConfig::TileParitionerM01>;
+        constexpr auto ConvSpec     = ck_tile::ConvolutionSpecialization::Default;
         using GroupedConvTraitsType = ck_tile::GroupedConvTraits<NDimSpatial,
                                                                  ConvSpec,
                                                                  InLayout,
                                                                  WeiLayout,
                                                                  DsLayout,
                                                                  OutLayout,
-                                                                 VectorSizeA,
-                                                                 VectorSizeB,
-                                                                 VectorSizeC,
-                                                                 NumGroupsToMerge,
-                                                                 CDElementWise>;
+                                                                 ConvConfig::VectorSizeA,
+                                                                 ConvConfig::VectorSizeB,
+                                                                 ConvConfig::VectorSizeC,
+                                                                 ConvConfig::NumGroupsToMerge>;
+
+        using TilePartitioner = ck_tile::GemmSpatiallyLocalTilePartitioner<
+            GemmShape,
+            GroupedConvTraitsType::FixedGemmParams::TilePartitionerGroupNum,
+            GroupedConvTraitsType::FixedGemmParams::TilePartitionerM01>;
 
         using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<
-            GemmConfig::kPadM,
-            GemmConfig::kPadN,
-            GemmConfig::kPadK,
-            GemmConfig::DoubleSmemBuffer,
-            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsFwd::AsLayout,
-            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsFwd::BsLayout,
-            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsFwd::CLayout,
-            GemmConfig::TransposeC,
-            GemmConfig::UseStructuredSparsity,
-            false, // Persistent,
-            GemmConfig::NumWaveGroups,
-            GemmConfig::Preshuffle>;
+            GroupedConvTraitsType::FixedGemmParams::kPadM,
+            GroupedConvTraitsType::FixedGemmParams::kPadN,
+            GroupedConvTraitsType::FixedGemmParams::kPadK,
+            ConvConfig::DoubleSmemBuffer,
+            typename GroupedConvTraitsType::AsLayoutFwd,
+            typename GroupedConvTraitsType::BsLayoutFwd,
+            typename GroupedConvTraitsType::CLayoutFwd,
+            GroupedConvTraitsType::FixedGemmParams::TransposeC,
+            GroupedConvTraitsType::FixedGemmParams::UseStructuredSparsity,
+            GroupedConvTraitsType::FixedGemmParams::Persistent,
+            ConvConfig::NumWaveGroups>;
 
         using GemmPipelineProblem = ck_tile::GemmPipelineProblem<
             InDataType,
             WeiDataType,
             AccDataType,
             GemmShape,
-            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsFwd,
+            typename GroupedConvTraitsType::template GroupedConvImplicitGemmTraitsFwd<
+                ConvConfig::NumWaveGroups>,
             ck_tile::element_wise::PassThrough,
             ck_tile::element_wise::PassThrough,
             OutDataType,
-            true,
-            VectorSizeA,
-            VectorSizeB>;
+            GroupedConvTraitsType::FixedGemmParams::FixedVectorSize,
+            GroupedConvTraitsType::VectorSizeA,
+            GroupedConvTraitsType::VectorSizeB>;
 
         using BaseGemmPipeline = typename PipelineTypeTraits<
-            GemmConfig::Pipeline>::template UniversalGemmPipeline<GemmPipelineProblem>;
+            ConvConfig::Pipeline>::template UniversalGemmPipeline<GemmPipelineProblem>;
 
         const ck_tile::index_t gemm_k =
             args.C_ * std::accumulate(args.filter_spatial_lengths_.begin(),
@@ -102,8 +94,8 @@ struct GroupedConvolutionForwardInvoker
                                       std::multiplies<ck_tile::index_t>());
 
         // Split-K parameters
-        const ck_tile::index_t k_grain     = args.k_batch * GemmConfig::K_Tile;
-        const ck_tile::index_t K_split     = (gemm_k + k_grain - 1) / k_grain * GemmConfig::K_Tile;
+        const ck_tile::index_t k_grain     = args.k_batch * ConvConfig::K_Tile;
+        const ck_tile::index_t K_split     = (gemm_k + k_grain - 1) / k_grain * ConvConfig::K_Tile;
         const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
         const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
         const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
@@ -112,89 +104,88 @@ struct GroupedConvolutionForwardInvoker
         // =====================================================================
         // Regular Convolution: Simple, no split-image
         // =====================================================================
-        const auto Run = [&]<bool EnableSplitImage>(const auto has_hot_loop_,
-                                                    const auto tail_number_,
-                                                    const auto memory_operation_) {
-            constexpr bool has_hot_loop_v   = has_hot_loop_.value;
-            constexpr auto tail_number_v    = tail_number_.value;
-            constexpr auto scheduler        = GemmConfig::Scheduler;
-            constexpr auto memory_operation = memory_operation_.value;
+        const auto Run =
+            [&](const auto has_hot_loop_, const auto tail_number_, const auto memory_operation_) {
+                constexpr bool has_hot_loop_v   = has_hot_loop_.value;
+                constexpr auto tail_number_v    = tail_number_.value;
+                constexpr auto scheduler        = ConvConfig::Scheduler;
+                constexpr auto memory_operation = memory_operation_.value;
 
-            using UniversalGemmProblem =
-                ck_tile::UniversalGemmPipelineProblem<InDataType,
-                                                      WeiDataType,
-                                                      AccDataType,
-                                                      GemmShape,
-                                                      GemmUniversalTraits,
-                                                      scheduler,
-                                                      has_hot_loop_v,
-                                                      tail_number_v,
-                                                      ck_tile::element_wise::PassThrough,
-                                                      ck_tile::element_wise::PassThrough,
-                                                      OutDataType,
-                                                      true,
-                                                      VectorSizeA,
-                                                      VectorSizeB>;
+                using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<
+                    InDataType,
+                    WeiDataType,
+                    AccDataType,
+                    GemmShape,
+                    GemmUniversalTraits,
+                    scheduler,
+                    has_hot_loop_v,
+                    tail_number_v,
+                    ck_tile::element_wise::PassThrough,
+                    ck_tile::element_wise::PassThrough,
+                    OutDataType,
+                    GroupedConvTraitsType::FixedGemmParams::FixedVectorSize,
+                    GroupedConvTraitsType::VectorSizeA,
+                    GroupedConvTraitsType::VectorSizeB>;
 
-            using GemmPipeline = typename PipelineTypeTraits<
-                GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
+                using GemmPipeline = typename PipelineTypeTraits<
+                    ConvConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
 
-            using ConvEpilogue = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
-                InDataType,
-                WeiDataType,
-                DsDataType,
-                AccDataType,
-                OutDataType,
-                typename GroupedConvTraitsType::ImplicitGemmDsLayout,
-                ck_tile::tensor_layout::gemm::RowMajor,
-                CDElementWise,
-                TilePartitioner::MPerBlock,
-                TilePartitioner::NPerBlock,
-                GemmConfig::M_Warp,
-                GemmConfig::N_Warp,
-                GemmConfig::M_Warp_Tile,
-                GemmConfig::N_Warp_Tile,
-                GemmConfig::K_Warp_Tile,
-                GemmConfig::TransposeC,
-                memory_operation,
-                1,
-                true,
-                GroupedConvTraitsType::VectorSizeC>>;
+                using ConvEpilogue = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
+                    InDataType,
+                    WeiDataType,
+                    DsDataType,
+                    AccDataType,
+                    OutDataType,
+                    typename GroupedConvTraitsType::ImplicitGemmDsLayout,
+                    typename GroupedConvTraitsType::FixedGemmParams::ELayout,
+                    CDElementWise,
+                    TilePartitioner::MPerBlock,
+                    TilePartitioner::NPerBlock,
+                    ConvConfig::M_Warp,
+                    ConvConfig::N_Warp,
+                    ConvConfig::M_Warp_Tile,
+                    ConvConfig::N_Warp_Tile,
+                    ConvConfig::K_Warp_Tile,
+                    GroupedConvTraitsType::FixedGemmParams::TransposeC,
+                    memory_operation,
+                    ConvConfig::NumWaveGroups,
+                    GroupedConvTraitsType::FixedGemmParams::FixedVectorSize,
+                    GroupedConvTraitsType::VectorSizeC>>;
 
-            using Kernel = ck_tile::GroupedConvolutionForwardKernel<EnableSplitImage,
-                                                                    GroupedConvTraitsType,
-                                                                    TilePartitioner,
-                                                                    GemmPipeline,
-                                                                    ConvEpilogue>;
-            auto kargs   = Kernel::MakeKernelArgs(args);
+                using Kernel = ck_tile::GroupedConvolutionForwardKernel<GroupedConvTraitsType,
+                                                                        TilePartitioner,
+                                                                        GemmPipeline,
+                                                                        ConvEpilogue>;
+                auto kargs   = Kernel::MakeKernelArgs(args);
 
-            const dim3 grids  = Kernel::GridSize(kargs);
-            const dim3 blocks = Kernel::BlockSize();
+                const dim3 grids  = Kernel::GridSize(kargs);
+                const dim3 blocks = Kernel::BlockSize();
 
-            if(!Kernel::IsSupportedArgument(kargs))
-            {
-                throw std::runtime_error("Wrong! Arguments not supported! Skipping conv!\n");
-            }
+                if(!Kernel::IsSupportedArgument(kargs))
+                {
+                    throw std::runtime_error("Wrong! Arguments not supported! Skipping conv!\n");
+                }
 
-            if(s.log_level_ > 0)
-            {
-                std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
-                          << "shape: " << GemmShape::GetName() << '\n'
-                          << "problem: " << UniversalGemmProblem::GetName() << '\n'
-                          << "pipeline: " << GemmPipeline::GetName() << '\n'
-                          << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
-                          << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
-                          << "}" << '\n'
-                          << "Vector size A: " << GemmPipeline::GetVectorSizeA()
-                          << ", Vector size B: " << GemmPipeline::GetVectorSizeB()
-                          << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
-            }
+                if(s.log_level_ > 0)
+                {
+                    std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
+                              << "shape: " << GemmShape::GetName() << '\n'
+                              << "problem: " << UniversalGemmProblem::GetName() << '\n'
+                              << "pipeline: " << GemmPipeline::GetName() << '\n'
+                              << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
+                              << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z
+                              << "}" << '\n'
+                              << "Vector size A: " << GemmPipeline::GetVectorSizeA()
+                              << ", Vector size B: " << GemmPipeline::GetVectorSizeB()
+                              << ", Vector size C: " << ConvEpilogue::GetVectorSizeC() << std::endl;
+                }
 
-            ave_time = ck_tile::launch_kernel(
-                s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+                ave_time = ck_tile::launch_kernel(s,
+                                                  ck_tile::make_kernel<ConvConfig::kBlockPerCu>(
+                                                      Kernel{}, grids, blocks, 0, kargs));
 
-            return ave_time;
-        };
+                return ave_time;
+            };
 
         // =====================================================================
         // Split-K lambda
@@ -202,11 +193,11 @@ struct GroupedConvolutionForwardInvoker
         const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
             if(args.k_batch == 1)
             {
-                Run.template operator()<false>(has_hot_loop_, tail_number_, MemoryOpSet{});
+                Run(has_hot_loop_, tail_number_, MemoryOpSet{});
             }
             else
             {
-                Run.template operator()<false>(has_hot_loop_, tail_number_, MemoryOpAtomicAdd{});
+                Run(has_hot_loop_, tail_number_, MemoryOpAtomicAdd{});
             }
         };
 
 
diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_large_tensor.cpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_large_tensor.cpp
index 9534f19711..a76d0874ee 100644
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_large_tensor.cpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_large_tensor.cpp
@@ -19,7 +19,7 @@
 #include "grouped_convolution_forward_large_tensor_invoker.hpp"
 #include "run_grouped_convolution_fwd_example.inc"
 
-template <template <typename PrecType> typename GemmConfig>
+template <template <typename PrecType> typename ConvConfig>
 int run_grouped_conv_fwd_example(int argc, char* argv[])
 {
     using Invoker = GroupedConvolutionForwardInvoker;
@@ -36,14 +36,14 @@ int run_grouped_conv_fwd_example(int argc, char* argv[])
     if(data_type == "fp16")
     {
         return run_grouped_conv_fwd_example_prec_type<Invoker,
-                                                      GemmConfig<ck_tile::half_t>,
+                                                      ConvConfig<ck_tile::half_t>,
                                                       ck_tile::half_t>(
             in_layout, wei_layout, out_layout, argc, argv);
     }
     else if(data_type == "bf16")
     {
         return run_grouped_conv_fwd_example_prec_type<Invoker,
-                                                      GemmConfig<ck_tile::bf16_t>,
+                                                      ConvConfig<ck_tile::bf16_t>,
                                                       ck_tile::bf16_t>(
             in_layout, wei_layout, out_layout, argc, argv);
     }
diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_large_tensor_invoker.hpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_large_tensor_invoker.hpp
index 6a76057d73..9d2752727c 100644
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_large_tensor_invoker.hpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_forward_large_tensor_invoker.hpp
@@ -7,7 +7,7 @@
 struct GroupedConvolutionForwardInvoker
 {
     template <ck_tile::index_t NDimSpatial,
-              typename GemmConfig,
+              typename ConvConfig,
               typename InDataType,
               typename WeiDataType,
               typename AccDataType,
@@ -25,65 +25,75 @@ struct GroupedConvolutionForwardInvoker
         {
             std::cout << "[INVOKER] grouped_conv_fwd called, NDimSpatial=" << NDimSpatial << "\n";
         }
-        constexpr int kBlockPerCu = 1;
 
         // Implicit GEMM Traits
         using GemmShape = ck_tile::TileGemmShape<
-            ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
-            ck_tile::sequence<GemmConfig::M_Warp, GemmConfig::N_Warp, GemmConfig::K_Warp>,
-            ck_tile::
-                sequence<GemmConfig::M_Warp_Tile, GemmConfig::N_Warp_Tile, GemmConfig::K_Warp_Tile>,
-            GemmConfig::PermuteA,
-            GemmConfig::PermuteB>;
-
-        constexpr ck_tile::index_t VectorSizeA = 8;
-        constexpr ck_tile::index_t VectorSizeB = 8;
-        constexpr ck_tile::index_t VectorSizeC = 8;
+            ck_tile::sequence<ConvConfig::M_Tile, ConvConfig::N_Tile, ConvConfig::K_Tile>,
+            ck_tile::sequence<ConvConfig::M_Warp, ConvConfig::N_Warp, ConvConfig::K_Warp>,
+            ck_tile::sequence<ConvConfig::M_Warp_Tile,
+                              ConvConfig::N_Warp_Tile,
+                              ConvConfig::K_Warp_Tile>>;
 
         constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default;
-        using TilePartitioner =
-            ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
-                                                       GemmConfig::TileParitionerGroupNum,
-                                                       GemmConfig::TileParitionerM01>;
-        using GroupedConvTraitsType = ck_tile::GroupedConvTraits<NDimSpatial,
-                                                                 ConvSpec,
-                                                                 InLayout,
-                                                                 WeiLayout,
-                                                                 DsLayout,
-                                                                 OutLayout,
-                                                                 VectorSizeA,
-                                                                 VectorSizeB,
-                                                                 VectorSizeC>;
+        using GroupedConvTraitsTypeDefault =
+            ck_tile::GroupedConvTraits<NDimSpatial,
+                                       ConvSpec,
+                                       InLayout,
+                                       WeiLayout,
+                                       DsLayout,
+                                       OutLayout,
+                                       ConvConfig::VectorSizeA,
+                                       ConvConfig::VectorSizeB,
+                                       ConvConfig::VectorSizeC,
+                                       ConvConfig::NumGroupsToMerge>;
+
+        using GroupedConvTraitsTypeLargeTensor =
+            ck_tile::GroupedConvTraits<NDimSpatial,
+                                       ConvSpec,
+                                       InLayout,
+                                       WeiLayout,
+                                       DsLayout,
+                                       OutLayout,
+                                       ConvConfig::VectorSizeA,
+                                       ConvConfig::VectorSizeB,
+                                       ConvConfig::VectorSizeC,
+                                       ConvConfig::NumGroupsToMerge,
+                                       true /*EnableSplitImage*/>;
+
+        using TilePartitioner = ck_tile::GemmSpatiallyLocalTilePartitioner<
+            GemmShape,
+            GroupedConvTraitsTypeDefault::FixedGemmParams::TilePartitionerGroupNum,
+            GroupedConvTraitsTypeDefault::FixedGemmParams::TilePartitionerM01>;
 
         using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<
-            GemmConfig::kPadM,
-            GemmConfig::kPadN,
-            GemmConfig::kPadK,
-            GemmConfig::DoubleSmemBuffer,
-            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsFwd::AsLayout,
-            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsFwd::BsLayout,
-            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsFwd::CLayout,
-            GemmConfig::TransposeC,
-            GemmConfig::UseStructuredSparsity,
-            false, // Persistent,
-            GemmConfig::NumWaveGroups,
-            GemmConfig::Preshuffle>;
+            GroupedConvTraitsTypeDefault::FixedGemmParams::kPadM,
+            GroupedConvTraitsTypeDefault::FixedGemmParams::kPadN,
+            GroupedConvTraitsTypeDefault::FixedGemmParams::kPadK,
+            ConvConfig::DoubleSmemBuffer,
+            typename GroupedConvTraitsTypeDefault::AsLayoutFwd,
+            typename GroupedConvTraitsTypeDefault::BsLayoutFwd,
+            typename GroupedConvTraitsTypeDefault::CLayoutFwd,
+            GroupedConvTraitsTypeDefault::FixedGemmParams::TransposeC,
+            GroupedConvTraitsTypeDefault::FixedGemmParams::UseStructuredSparsity,
+            GroupedConvTraitsTypeDefault::FixedGemmParams::Persistent,
+            ConvConfig::NumWaveGroups>;
 
         using GemmPipelineProblem = ck_tile::GemmPipelineProblem<
             InDataType,
             WeiDataType,
             AccDataType,
             GemmShape,
-            typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsFwd,
+            typename GroupedConvTraitsTypeDefault::template GroupedConvImplicitGemmTraitsFwd<
+                ConvConfig::NumWaveGroups>,
             ck_tile::element_wise::PassThrough,
             ck_tile::element_wise::PassThrough,
             OutDataType,
-            true,
-            VectorSizeA,
-            VectorSizeB>;
+            GroupedConvTraitsTypeDefault::FixedGemmParams::FixedVectorSize,
+            GroupedConvTraitsTypeDefault::VectorSizeA,
+            GroupedConvTraitsTypeDefault::VectorSizeB>;
 
         using BaseGemmPipeline = typename PipelineTypeTraits<
-            GemmConfig::Pipeline>::template UniversalGemmPipeline<GemmPipelineProblem>;
+            ConvConfig::Pipeline>::template UniversalGemmPipeline<GemmPipelineProblem>;
 
         const ck_tile::index_t gemm_k =
             args.C_ * std::accumulate(args.filter_spatial_lengths_.begin(),
@@ -92,8 +102,8 @@ struct GroupedConvolutionForwardInvoker
                                       std::multiplies<ck_tile::index_t>());
 
         // Split-K parameters
-        const ck_tile::index_t k_grain     = args.k_batch * GemmConfig::K_Tile;
-        const ck_tile::index_t K_split     = (gemm_k + k_grain - 1) / k_grain * GemmConfig::K_Tile;
+        const ck_tile::index_t k_grain     = args.k_batch * ConvConfig::K_Tile;
+        const ck_tile::index_t K_split     = (gemm_k + k_grain - 1) / k_grain * ConvConfig::K_Tile;
         const ck_tile::index_t num_loop    = TilePartitioner::GetLoopNum(K_split);
         const bool has_hot_loop            = BaseGemmPipeline::BlockHasHotloop(num_loop);
         const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop);
@@ -102,9 +112,9 @@ struct GroupedConvolutionForwardInvoker
         using TransformType =
             ck_tile::TransformConvFwdToGemm<NDimSpatial,
                                             ck_tile::ConvolutionSpecialization::Default,
-                                            VectorSizeA,
-                                            VectorSizeB,
-                                            VectorSizeC,
+                                            GroupedConvTraitsTypeDefault::VectorSizeA,
+                                            GroupedConvTraitsTypeDefault::VectorSizeB,
+                                            GroupedConvTraitsTypeDefault::VectorSizeC,
                                             1,     // NumGroupsToMerge
                                             false, // SplitN
                                             InDataType,
@@ -243,27 +253,31 @@ struct GroupedConvolutionForwardInvoker
                                                     const auto memory_operation_) {
             constexpr bool has_hot_loop_v   = has_hot_loop_.value;
             constexpr auto tail_number_v    = tail_number_.value;
-            constexpr auto scheduler        = GemmConfig::Scheduler;
+            constexpr auto scheduler        = ConvConfig::Scheduler;
             constexpr auto memory_operation = memory_operation_.value;
 
-            using UniversalGemmProblem =
-                ck_tile::UniversalGemmPipelineProblem<InDataType,
-                                                      WeiDataType,
-                                                      AccDataType,
-                                                      GemmShape,
-                                                      GemmUniversalTraits,
-                                                      scheduler,
-                                                      has_hot_loop_v,
-                                                      tail_number_v,
-                                                      ck_tile::element_wise::PassThrough,
-                                                      ck_tile::element_wise::PassThrough,
-                                                      OutDataType,
-                                                      true,
-                                                      VectorSizeA,
-                                                      VectorSizeB>;
+            using GroupedConvTraitsType = std::conditional_t<EnableSplitImage,
+                                                             GroupedConvTraitsTypeLargeTensor,
+                                                             GroupedConvTraitsTypeDefault>;
+
+            using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<
+                InDataType,
+                WeiDataType,
+                AccDataType,
+                GemmShape,
+                GemmUniversalTraits,
+                scheduler,
+                has_hot_loop_v,
+                tail_number_v,
+                ck_tile::element_wise::PassThrough,
+                ck_tile::element_wise::PassThrough,
+                OutDataType,
+                GroupedConvTraitsType::FixedGemmParams::FixedVectorSize,
+                GroupedConvTraitsType::VectorSizeA,
+                GroupedConvTraitsType::VectorSizeB>;
 
             using GemmPipeline = typename PipelineTypeTraits<
-                GemmConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
+                ConvConfig::Pipeline>::template GemmPipeline<UniversalGemmProblem>;
 
             using ConvEpilogue = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
                 InDataType,
@@ -272,24 +286,23 @@ struct GroupedConvolutionForwardInvoker
                 AccDataType,
                 OutDataType,
                 typename GroupedConvTraitsType::ImplicitGemmDsLayout,
-                ck_tile::tensor_layout::gemm::RowMajor,
+                typename GroupedConvTraitsType::FixedGemmParams::ELayout,
                 CDEElementWise,
                 TilePartitioner::MPerBlock,
                 TilePartitioner::NPerBlock,
-                GemmConfig::M_Warp,
-                GemmConfig::N_Warp,
-                GemmConfig::M_Warp_Tile,
-                GemmConfig::N_Warp_Tile,
-                GemmConfig::K_Warp_Tile,
-                GemmConfig::TransposeC,
+                ConvConfig::M_Warp,
+                ConvConfig::N_Warp,
+                ConvConfig::M_Warp_Tile,
+                ConvConfig::N_Warp_Tile,
+                ConvConfig::K_Warp_Tile,
+                GroupedConvTraitsType::FixedGemmParams::TransposeC,
                 memory_operation,
-                1,
-                true,
+                ConvConfig::NumWaveGroups,
+                GroupedConvTraitsType::FixedGemmParams::FixedVectorSize,
                 GroupedConvTraitsType::VectorSizeC>>;
 
             // Use split-image kernel if layout supports it, otherwise use regular kernel
-            using Kernel = ck_tile::GroupedConvolutionForwardKernel<EnableSplitImage,
-                                                                    GroupedConvTraitsType,
+            using Kernel = ck_tile::GroupedConvolutionForwardKernel<GroupedConvTraitsType,
                                                                     TilePartitioner,
                                                                     GemmPipeline,
                                                                     ConvEpilogue>;
@@ -351,7 +364,8 @@ struct GroupedConvolutionForwardInvoker
             }
 
             ave_time = ck_tile::launch_kernel(
-                s, ck_tile::make_kernel<kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
+                s,
+                ck_tile::make_kernel<ConvConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
 
             return ave_time;
         };
diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_utils.hpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_utils.hpp
index 91fa444f0d..b0e2c02973 100644
--- a/example/ck_tile/20_grouped_convolution/grouped_convolution_utils.hpp
+++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_utils.hpp
@@ -11,7 +11,9 @@
 #include "ck_tile/ops/epilogue.hpp"
 #include "ck_tile/ops/gemm.hpp"
 #include "ck_tile/ops/grouped_convolution.hpp"
+#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
 #include "conv_configs.hpp"
+
 using MemoryOpSet =
     std::integral_constant<ck_tile::memory_operation_enum, ck_tile::memory_operation_enum::set>;
 using MemoryOpAtomicAdd = std::integral_constant<ck_tile::memory_operation_enum,
diff --git a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_data_example.inc b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_data_example.inc
index c7be53f2df..e2ebfd6f8e 100644
--- a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_data_example.inc
+++ b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_bwd_data_example.inc
@@ -3,7 +3,7 @@
 #pragma once
 
 template <ck_tile::index_t NDimSpatial,
-          typename GemmConfig,
+          typename ConvConfig,
           typename Invoker,
           typename InDataType,
           typename WeiDataType,
@@ -17,7 +17,7 @@ float invoke_grouped_conv_bwd_data(ck_tile::GroupedConvBwdDataHostArgs& args,
                                    int n_repeat)
 {
     float ave_time = Invoker::template grouped_conv_bwd_data<NDimSpatial,
-                                                             GemmConfig,
+                                                             ConvConfig,
                                                              InDataType,
                                                              WeiDataType,
                                                              AccDataType,
@@ -39,7 +39,7 @@ float invoke_grouped_conv_bwd_data(ck_tile::GroupedConvBwdDataHostArgs& args,
 }
 
 template <ck_tile::index_t NDimSpatial,
-          typename GemmConfig,
+          typename ConvConfig,
           typename Invoker,
           typename InDataType,
           typename WeiDataType = InDataType,
@@ -141,7 +141,7 @@ int run_grouped_conv_bwd_data_example_with_layouts(
     std::cout << "output: " << output.mDesc << std::endl;
 
     invoke_grouped_conv_bwd_data<NDimSpatial,
-                                 GemmConfig,
+                                 ConvConfig,
                                  Invoker,
                                  InDataType,
                                  WeiDataType,
@@ -193,7 +193,7 @@ int run_grouped_conv_bwd_data_example_with_layouts(
 }
 
 template <typename Invoker,
-          typename GemmConfig,
+          typename ConvConfig,
           typename InPrecType,
           typename WeiPrecType = InPrecType,
           typename OutPrecType = InPrecType>
@@ -215,7 +215,7 @@ int run_grouped_conv_bwd_data_example_prec_type(
     if(in_layout == "NWGC" && wei_layout == "GKXC" && out_layout == "NWGK")
     {
         return run_grouped_conv_bwd_data_example_with_layouts<ck_tile::number<1>{},
-                                                              GemmConfig,
+                                                              ConvConfig,
                                                               Invoker,
                                                               InPrecType,
                                                               WeiPrecType,
@@ -225,7 +225,7 @@ int run_grouped_conv_bwd_data_example_prec_type(
     else if(in_layout == "NHWGC" && wei_layout == "GKYXC" && out_layout == "NHWGK")
     {
         return run_grouped_conv_bwd_data_example_with_layouts<ck_tile::number<2>{},
-                                                              GemmConfig,
+                                                              ConvConfig,
                                                               Invoker,
                                                               InPrecType,
                                                               WeiPrecType,
@@ -235,7 +235,7 @@ int run_grouped_conv_bwd_data_example_prec_type(
     else if(in_layout == "NDHWGC" && wei_layout == "GKZYXC" && out_layout == "NDHWGK")
     {
         return run_grouped_conv_bwd_data_example_with_layouts<ck_tile::number<3>{},
-                                                              GemmConfig,
+                                                              ConvConfig,
                                                               Invoker,
                                                               InPrecType,
                                                               WeiPrecType,
diff --git a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_example.inc b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_example.inc
index a6b1334d9e..7175a85ba7 100644
--- a/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_example.inc
+++ b/example/ck_tile/20_grouped_convolution/run_grouped_convolution_fwd_example.inc
@@ -3,7 +3,7 @@
 #pragma once
 
 template <ck_tile::index_t NDimSpatial,
-          typename GemmConfig,
+          typename ConvConfig,
           typename Invoker,
           typename InDataType,
           typename WeiDataType,
@@ -17,7 +17,7 @@ float invoke_grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs<>& args,
                               int n_repeat)
 {
     float ave_time = Invoker::template grouped_conv_fwd<NDimSpatial,
-                                                        GemmConfig,
+                                                        ConvConfig,
                                                         InDataType,
                                                         WeiDataType,
                                                         AccDataType,
@@ -39,7 +39,7 @@ float invoke_grouped_conv_fwd(const ck_tile::GroupedConvFwdHostArgs<>& args,
 }
 
 template <ck_tile::index_t NDimSpatial,
-          typename GemmConfig,
+          typename ConvConfig,
           typename Invoker,
           typename InDataType,
           typename WeiDataType = InDataType,
@@ -141,7 +141,7 @@ int run_grouped_conv_fwd_example_with_layouts(
     std::cout << "output: " << output.mDesc << std::endl;
 
     invoke_grouped_conv_fwd<NDimSpatial,
-                            GemmConfig,
+                            ConvConfig,
                             Invoker,
                             InDataType,
                             WeiDataType,
@@ -193,7 +193,7 @@ int run_grouped_conv_fwd_example_with_layouts(
 }
 
 template <typename Invoker,
-          typename GemmConfig,
+          typename ConvConfig,
           typename InPrecType,
           typename WeiPrecType = InPrecType,
           typename OutPrecType = InPrecType>
@@ -215,7 +215,7 @@ int run_grouped_conv_fwd_example_prec_type(
     if(in_layout == "NWGC" && wei_layout == "GKXC" && out_layout == "NWGK")
     {
         return run_grouped_conv_fwd_example_with_layouts<ck_tile::number<1>{},
-                                                         GemmConfig,
+                                                         ConvConfig,
                                                          Invoker,
                                                          InPrecType,
                                                          WeiPrecType,
@@ -225,7 +225,7 @@ int run_grouped_conv_fwd_example_prec_type(
     else if(in_layout == "NHWGC" && wei_layout == "GKYXC" && out_layout == "NHWGK")
     {
         return run_grouped_conv_fwd_example_with_layouts<ck_tile::number<2>{},
-                                                         GemmConfig,
+                                                         ConvConfig,
                                                          Invoker,
                                                          InPrecType,
                                                          WeiPrecType,
@@ -235,7 +235,7 @@ int run_grouped_conv_fwd_example_prec_type(
     else if(in_layout == "NDHWGC" && wei_layout == "GKZYXC" && out_layout == "NDHWGK")
     {
         return run_grouped_conv_fwd_example_with_layouts<ck_tile::number<3>{},
-                                                         GemmConfig,
+                                                         ConvConfig,
                                                          Invoker,
                                                          InPrecType,
                                                          WeiPrecType,
diff --git a/example/ck_tile/36_pooling/README.md b/example/ck_tile/36_pooling/README.md
index ab49b57095..4417e03734 100644
--- a/example/ck_tile/36_pooling/README.md
+++ b/example/ck_tile/36_pooling/README.md
@@ -2,6 +2,116 @@
 
 This folder contains example for the pooling operator using ck_tile tile-programming implementation. Currently the pooling kernel only supports 2D and 3D pooling.
 
+## Tensor Descriptor Transformations
+
+The pooling kernel transforms the input tensor into a 2D format suitable for reduction. This section explains the transformation pipeline for both 2D and 3D pooling operations.
+
+### 3D Pooling Transformations
+
+For 3D pooling, the input tensor has shape `(N, D, H, W, C)` where:
+- `N`: batch size
+- `D`: depth dimension
+- `H`: height dimension
+- `W`: width dimension
+- `C`: channel dimension
+
+The transformations convert this 5D tensor into a 2D tensor where rows represent output positions (M) and columns represent pooling window elements (K).
+
+```mermaid
+graph TD
+    %% Input Tensor: (N, D, H, W, C)
+    Input["Input Tensor<br/>(N, D, H, W, C)"]
+    style Input fill:#e1f5fe
+
+    %% Pass-through N dimension
+    PassN["Pass-through N<br/>(batch size)"]
+    style PassN fill:#f3e5f5
+    Input --> PassN
+
+    %% Pad spatial dimensions
+    PadD["Pad D<br/>(depth with left/right padding)"]
+    style PadD fill:#fff9c4
+    Input --> PadD
+
+    PadH["Pad H<br/>(height with left/right padding)"]
+    style PadH fill:#fff9c4
+    Input --> PadH
+
+    PadW["Pad W<br/>(width with left/right padding)"]
+    style PadW fill:#fff9c4
+    Input --> PadW
+
+    %% Pass-through C dimension
+    PassC["Pass-through C<br/>(channels)"]
+    style PassC fill:#f3e5f5
+    Input --> PassC
+
+    %% Embed sliding windows
+    EmbedD["Embed D<br/>window(Z) × output_positions(Dₒ)"]
+    style EmbedD fill:#fff3e0
+    PadD --> EmbedD
+
+    EmbedH["Embed H<br/>window(Y) × output_positions(Hₒ)"]
+    style EmbedH fill:#fff3e0
+    PadH --> EmbedH
+
+    EmbedW["Embed W<br/>window(X) × output_positions(Wₒ)"]
+    style EmbedW fill:#fff3e0
+    PadW --> EmbedW
+
+    %% Merge into 2D matrix
+    MergeM["Merge M<br/>(N, Dₒ, Hₒ, Wₒ, C)<br/>→ output positions"]
+    style MergeM fill:#e8f5e9
+    PassN --> MergeM
+    EmbedD --> MergeM
+    EmbedH --> MergeM
+    EmbedW --> MergeM
+    PassC --> MergeM
+
+    MergeK["Merge K<br/>(Z, Y, X)<br/>→ window elements"]
+    style MergeK fill:#e8f5e9
+    EmbedD --> MergeK
+    EmbedH --> MergeK
+    EmbedW --> MergeK
+
+    %% Final padding for block alignment
+    PadM["Right-pad M<br/>(for block alignment)"]
+    style PadM fill:#fff9c4
+    MergeM --> PadM
+
+    PadK["Right-pad K<br/>(for block alignment)"]
+    style PadK fill:#fff9c4
+    MergeK --> PadK
+
+    %% Result
+    Result["2D Matrix<br/>(M × K)"]
+    style Result fill:#c8e6c9
+    PadM --> Result
+    PadK --> Result
+```
+
+**Transformation Steps:**
+1. **Padding**: Apply left and right padding to spatial dimensions (D, H, W) to handle boundary conditions
+2. **Sliding Windows**: Use embed transforms to create sliding windows across each spatial dimension, expanding each dimension into (window_size, output_positions)
+3. **Reshaping**: Merge all dimensions into a 2D matrix where:
+   - M dimension = N × Dₒ × Hₒ × Wₒ × C (total output positions)
+   - K dimension = Z × Y × X (elements per pooling window)
+4. **Block Alignment**: Apply right padding to ensure M and K dimensions are aligned to block size
+
+### 2D Pooling Transformations
+
+2D pooling follows the same transformation pipeline but operates on 4D tensors with shape `(N, H, W, C)`. The process is identical except:
+- Only H and W dimensions are padded and embedded
+- K dimension merges only (Y, X) window elements
+- M dimension merges (N, Hₒ, Wₒ, C)
+
+### Output Tensor Transformations
+
+The output tensor transformations are simpler:
+- Merge all output dimensions into a single M dimension: (N, Dₒ, Hₒ, Wₒ, C) for 3D pooling, (N, Hₒ, Wₒ, C) for 2D pooling
+- Apply right padding for block alignment
+- The result is a 1D tensor that maps directly to the M dimension of the computation matrix
+
 ## build
 ```
 # in the root of ck_tile
diff --git a/example/ck_tile/38_block_scale_gemm/CMakeLists.txt b/example/ck_tile/38_block_scale_gemm/CMakeLists.txt
index 7358d4d749..b1ae9369a2 100644
--- a/example/ck_tile/38_block_scale_gemm/CMakeLists.txt
+++ b/example/ck_tile/38_block_scale_gemm/CMakeLists.txt
@@ -5,7 +5,7 @@ endif()
 
 list(APPEND EXAMPLE_GEMM_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0)
 
-if(GPU_TARGETS MATCHES "gfx94" OR GPU_TARGETS MATCHES "gfx95")
+if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx12")
     add_executable(tile_example_gemm_quant_basic EXCLUDE_FROM_ALL gemm_quant_basic.cpp)
     target_compile_options(tile_example_gemm_quant_basic PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
 else()
diff --git a/example/ck_tile/38_block_scale_gemm/gemm_quant_basic.cpp b/example/ck_tile/38_block_scale_gemm/gemm_quant_basic.cpp
index b22596537f..d605a2b780 100644
--- a/example/ck_tile/38_block_scale_gemm/gemm_quant_basic.cpp
+++ b/example/ck_tile/38_block_scale_gemm/gemm_quant_basic.cpp
@@ -419,6 +419,10 @@ int dispatch_group_size_ct(int m, int n, int k, F&& f)
 
 int main(int argc, char* argv[])
 {
+#if CK_TILE_USE_WMMA
+    return !run_gemm_example<GemmConfigBQuantPrefill_Wmma>(argc, argv);
+#else
     // Use non-preshuffled GemmConfig for 2D block scale support
     return !run_gemm_example<GemmConfigBQuantPrefill>(argc, argv);
+#endif
 }
diff --git a/example/ck_tile/38_block_scale_gemm/gemm_utils.hpp b/example/ck_tile/38_block_scale_gemm/gemm_utils.hpp
index aec9dfdde4..1839c7f98d 100644
--- a/example/ck_tile/38_block_scale_gemm/gemm_utils.hpp
+++ b/example/ck_tile/38_block_scale_gemm/gemm_utils.hpp
@@ -12,12 +12,11 @@
 #include "ck_tile/ops/gemm_quant.hpp"
 
 #define CK_TILE_SUPPORTED_QUANT_GROUPS(X) \
-    X(1, 1, 64)    /* 1D */               \
-    X(1, 1, 128)   /* 1D */               \
-    X(1, 8, 128)   /* 2D N=8  */          \
-    X(1, 32, 128)  /* 2D N=32 */          \
-    X(1, 64, 128)  /* 2D N=64 */          \
-    X(1, 128, 128) /* 2D N=128 */
+    X(1, 1, 64)   /* 1D */                \
+    X(1, 1, 128)  /* 1D */                \
+    X(1, 8, 128)  /* 2D N=8  */           \
+    X(1, 32, 128) /* 2D N=32 */           \
+    X(1, 64, 128) /* 2D N=64 */
 
 template <typename PrecType, ck_tile::index_t M_Warp_Tile>
 constexpr ck_tile::index_t get_k_warp_tile()
@@ -217,6 +216,14 @@ struct GemmConfigBQuantPrefill : public GemmConfigBase
     static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<PrecType, M_Warp_Tile>();
 };
 
+template <typename PrecType>
+struct GemmConfigBQuantPrefill_Wmma : public GemmConfigBQuantPrefill<PrecType>
+{
+    static constexpr ck_tile::index_t M_Warp_Tile = 16;
+    static constexpr ck_tile::index_t N_Warp_Tile = 16;
+    static constexpr ck_tile::index_t K_Warp_Tile = 16;
+};
+
 template <typename ADataType_,
           typename BDataType_ = ADataType_,
           typename CDataType_ = ADataType_,
diff --git a/example/ck_tile/40_streamk_gemm/README.md b/example/ck_tile/40_streamk_gemm/README.md
index fe9eb0c4f8..0272b1fe97 100644
--- a/example/ck_tile/40_streamk_gemm/README.md
+++ b/example/ck_tile/40_streamk_gemm/README.md
@@ -22,8 +22,8 @@ args:
           -a_layout    tensor A data layout (default: R)
           -b_layout    tensor B data layout (default: C)
           -c_layout    tensor C data layout (default: R)
-     -num_sk_blocks    number of Stream-K blocks. -1: chosen by algorithm, or user selected (default:-1)
 -reduction_strategy    strategy for storing results in C tensor. atomic/reduction (default:atomic)
+     -persistent_dp    persistent strategy for data-parallel section. Set to 0 for non-persistent or to 1 for persistent. (default:0)
           -stride_a    tensor A stride (default:0)
           -stride_b    tensor B stride (default:0)
           -stride_c    tensor C stride (default:0)
diff --git a/example/ck_tile/40_streamk_gemm/gemm_utils.hpp b/example/ck_tile/40_streamk_gemm/gemm_utils.hpp
index abcca7eaec..69095ca3d7 100644
--- a/example/ck_tile/40_streamk_gemm/gemm_utils.hpp
+++ b/example/ck_tile/40_streamk_gemm/gemm_utils.hpp
@@ -18,7 +18,6 @@ struct GemmConfigBase
 
     static constexpr bool TransposeC            = false;
     static constexpr bool UseStructuredSparsity = false;
-    static constexpr bool Persistent            = false;
 
     static constexpr int kBlockPerCu                = 1;
     static constexpr auto Scheduler                 = ck_tile::GemmPipelineScheduler::Intrawave;
@@ -27,12 +26,12 @@ struct GemmConfigBase
     static constexpr bool DoubleSmemBuffer          = false;
 };
 
-template <typename PrecType>
+template <typename PrecType, bool Persistent_>
 struct GemmConfigMemoryInterwave : public GemmConfigBase
 {
-    static constexpr ck_tile::index_t M_Tile = 128;
-    static constexpr ck_tile::index_t N_Tile = 128;
-    static constexpr ck_tile::index_t K_Tile = 32;
+    static constexpr ck_tile::index_t M_Tile = 256;
+    static constexpr ck_tile::index_t N_Tile = 256;
+    static constexpr ck_tile::index_t K_Tile = 16;
 
     static constexpr ck_tile::index_t M_Warp = 2;
     static constexpr ck_tile::index_t N_Warp = 2;
@@ -42,7 +41,8 @@ struct GemmConfigMemoryInterwave : public GemmConfigBase
     static constexpr ck_tile::index_t N_Warp_Tile = 32;
     static constexpr ck_tile::index_t K_Warp_Tile = sizeof(PrecType) == 2 ? 8 : 16;
 
-    static constexpr auto Scheduler = ck_tile::GemmPipelineScheduler::Intrawave;
+    static constexpr bool Persistent = Persistent_;
+    static constexpr auto Scheduler  = ck_tile::GemmPipelineScheduler::Intrawave;
 };
 
 template <typename ADataType_, typename BDataType_ = ADataType_, typename CDataType_ = ADataType_>
@@ -96,12 +96,12 @@ auto create_args(int argc, char* argv[])
         .insert("a_layout", "R", "A tensor data layout - Row by default")
         .insert("b_layout", "C", "B tensor data layout - Column by default")
         .insert("c_layout", "R", "C tensor data layout - Row by default")
-        .insert("num_sk_blocks",
-                "-1",
-                "number of Stream-K blocks. -1: chosen by algorithm, or user selected")
         .insert("reduction_strategy",
                 "atomic",
                 "strategy for storing results in C tensor - atomic/reduction")
+        .insert("persistent_dp",
+                "0",
+                "0. Non-persistent data-parallel section, 1 Fully persistent kernel.")
         .insert("stride_a", "0", "Tensor A stride")
         .insert("stride_b", "0", "Tensor B stride")
         .insert("stride_c", "0", "Tensor C stride")
diff --git a/example/ck_tile/40_streamk_gemm/run_gemm_example.inc b/example/ck_tile/40_streamk_gemm/run_gemm_example.inc
index 6dd054ee11..17182d87dc 100644
--- a/example/ck_tile/40_streamk_gemm/run_gemm_example.inc
+++ b/example/ck_tile/40_streamk_gemm/run_gemm_example.inc
@@ -69,20 +69,18 @@ invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
             int n_warmup,
             int n_repeat,
             bool flush_cache,
-            ck_tile::StreamKReductionStrategy reduction_strategy,
-            uint32_t num_sk_blocks)
+            ck_tile::StreamKReductionStrategy reduction_strategy)
 {
-    ck_tile::StreamKHostArgs args{a_m_k_dev_buf.GetDeviceBuffer(),
-                                  b_k_n_dev_buf.GetDeviceBuffer(),
-                                  c_m_n_dev_buf.GetDeviceBuffer(),
-                                  M,
-                                  N,
-                                  K,
-                                  stride_A,
-                                  stride_B,
-                                  stride_C,
-                                  reduction_strategy,
-                                  num_sk_blocks};
+    ck_tile::reboot::StreamKHostArgs args{a_m_k_dev_buf.GetDeviceBuffer(),
+                                          b_k_n_dev_buf.GetDeviceBuffer(),
+                                          c_m_n_dev_buf.GetDeviceBuffer(),
+                                          M,
+                                          N,
+                                          K,
+                                          stride_A,
+                                          stride_B,
+                                          stride_C,
+                                          reduction_strategy};
 
     std::tuple<float, ck_tile::index_t> ave_time_and_batch;
 
@@ -197,7 +195,6 @@ int run_gemm_example_with_layouts(int argc,
 
     ck_tile::StreamKReductionStrategy reduction_strategy =
         get_reduction_strategy_value(arg_parser.get_str("reduction_strategy"));
-    uint32_t num_sk_blocks = static_cast<uint32_t>(arg_parser.get_int("num_sk_blocks"));
 
     stride_A = ck_tile::get_default_stride(M, K, stride_A, is_row_major(a_layout));
     stride_B = ck_tile::get_default_stride(K, N, stride_B, is_row_major(b_layout));
@@ -261,8 +258,7 @@ int run_gemm_example_with_layouts(int argc,
                                                              n_warmup,
                                                              n_repeat,
                                                              flush_cache,
-                                                             reduction_strategy,
-                                                             num_sk_blocks);
+                                                             reduction_strategy);
 
     c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data());
 
@@ -279,10 +275,10 @@ int run_gemm_example_with_layouts(int argc,
               << " B_Type=" << DataTypeTraits<BDataType>::name
               << " C_Type=" << DataTypeTraits<CDataType>::name
               << " reduction_strategy=" << arg_parser.get_str("reduction_strategy") << " "
-              << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
-              << std::endl;
+              << " persistent_dp=" << arg_parser.get_str("persistent_dp") << " " << ave_time
+              << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " << std::endl;
 
-    bool pass = true;
+    bool pass = false;
 
     // Memory on host to store gpu reference result
     ck_tile::HostTensor<CDataType> c_m_n_ref(
diff --git a/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp b/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp
index 8ec409023d..e04cb00379 100644
--- a/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp
+++ b/example/ck_tile/40_streamk_gemm/streamk_gemm_basic.cpp
@@ -2,7 +2,6 @@
 // SPDX-License-Identifier:  MIT
 
 #include "gemm_utils.hpp"
-#include "run_gemm_example.inc"
 #include "ck_tile/ops/common.hpp"
 
 template <typename GemmConfig,
@@ -17,9 +16,8 @@ template <typename GemmConfig,
           typename ELayout,
           typename CDEElementWise,
           ck_tile::StreamKReductionStrategy ReductionStrategy>
-std::tuple<float, ck_tile::index_t> gemm(const ck_tile::StreamKHostArgs& args,
+std::tuple<float, ck_tile::index_t> gemm(const ck_tile::reboot::StreamKHostArgs& args,
                                          const ck_tile::stream_config& s)
-
 {
     using GemmShape = ck_tile::TileGemmShape<
         ck_tile::sequence<GemmConfig::M_Tile, GemmConfig::N_Tile, GemmConfig::K_Tile>,
@@ -29,7 +27,8 @@ std::tuple<float, ck_tile::index_t> gemm(const ck_tile::StreamKHostArgs& args,
         GemmConfig::PermuteA,
         GemmConfig::PermuteB>;
 
-    using TilePartitioner = ck_tile::StreamKTilePartitioner<GemmShape, ReductionStrategy>;
+    using TilePartitioner =
+        ck_tile::StreamKTilePartitioner_v2<GemmShape, ReductionStrategy, GemmConfig::Persistent>;
 
     using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<GemmConfig::kPadM,
                                                                  GemmConfig::kPadN,
@@ -78,9 +77,13 @@ std::tuple<float, ck_tile::index_t> gemm(const ck_tile::StreamKHostArgs& args,
                                              memory_operation.value,
                                              GemmConfig::NumWaveGroups>>;
 
-        using Kernel = ck_tile::StreamKKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
+        using Kernel = ck_tile::reboot::StreamKKernel<TilePartitioner, GemmPipeline, GemmEpilogue>;
 
-        auto kargs = Kernel::MakeKernelArgs(args);
+        auto kargs                = Kernel::MakeKernelArgs(args);
+        const auto workspace_size = Kernel::GetWorkSpaceSize(kargs);
+        ck_tile::DeviceMem workspace_data(workspace_size);
+        workspace_data.SetZero();
+        kargs.workspace_ptr = workspace_data.GetDeviceBuffer();
 
         dim3 grids  = Kernel::GridSize(kargs.tile_partitioner);
         dim3 blocks = Kernel::BlockSize();
@@ -101,28 +104,28 @@ std::tuple<float, ck_tile::index_t> gemm(const ck_tile::StreamKHostArgs& args,
                       << std::endl;
         }
 
-        // Function to clear the output C tensor results after each repetition of the kernel
-        auto clear_gemm_output = [&]() {
+        auto reset_data_buffers = [&]() {
             if(ReductionStrategy == ck_tile::StreamKReductionStrategy::Atomic)
+            {
+                // Clear the output C tensor results after each repetition of the kernel
                 hipGetErrorString(hipMemsetAsync(
                     args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_));
+            }
+            else if(ReductionStrategy == ck_tile::StreamKReductionStrategy::Reduction)
+            {
+                // Reset sk flags to zero before each repetition of the kernel
+                workspace_data.SetZero();
+            }
         };
 
-        std::function<void()> preprocess = clear_gemm_output;
+        std::function<void()> preprocess = reset_data_buffers;
 
         float ave_time = ck_tile::launch_kernel_time_mask(
             s,
             preprocess,
             ck_tile::make_kernel<GemmConfig::kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
 
-        ck_tile::index_t num_wgs_per_tile = ck_tile::estimate_num_wgs_per_tile<ReductionStrategy>(
-            kargs.tile_partitioner.sk_num_blocks,
-            // k_iters_per_big_block could be 1, which indicates that all Stream-K workgroups are
-            // big and each does one iteration. Thus, we ensure the value passed in is at least 1 to
-            // avoid division by zero errors.
-            ck_tile::max(kargs.tile_partitioner.k_iters_per_big_block - 1, 1u),
-            kargs.tile_partitioner.k_iters_per_tile.get());
-
+        ck_tile::index_t num_wgs_per_tile = kargs.tile_partitioner.estimate_num_wgs_per_tile();
         return std::tuple{ave_time, num_wgs_per_tile};
     };
 
@@ -145,6 +148,8 @@ std::tuple<float, ck_tile::index_t> gemm(const ck_tile::StreamKHostArgs& args,
     }
 }
 
+#include "run_gemm_example.inc"
+
 template <typename GemmConfig, typename TypeConfig>
 int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[])
 {
@@ -164,7 +169,7 @@ int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int a
     return 0;
 }
 
-template <template <typename PreType> typename GemmConfig>
+template <template <typename PreType, bool Persistent_> typename GemmConfig>
 int run_gemm_example(int argc, char* argv[])
 {
     auto [result, arg_parser] = create_args(argc, argv);
@@ -174,30 +179,63 @@ int run_gemm_example(int argc, char* argv[])
     std::string data_type = arg_parser.get_str("prec");
     std::string a_layout  = arg_parser.get_str("a_layout");
     std::string b_layout  = arg_parser.get_str("b_layout");
+    auto persistent_dp    = arg_parser.get_bool("persistent_dp");
 
     if(data_type == "bf16")
     {
         using TypeConfig = StreamKGemmTypeConfig<ck_tile::bf16_t>;
-        return run_gemm_example_prec_type<GemmConfig<ck_tile::bf16_t>, TypeConfig>(
-            a_layout, b_layout, argc, argv);
+        if(persistent_dp)
+        {
+            return run_gemm_example_prec_type<GemmConfig<ck_tile::bf16_t, true>, TypeConfig>(
+                a_layout, b_layout, argc, argv);
+        }
+        else
+        {
+            return run_gemm_example_prec_type<GemmConfig<ck_tile::bf16_t, false>, TypeConfig>(
+                a_layout, b_layout, argc, argv);
+        }
     }
     else if(data_type == "fp16")
     {
         using TypeConfig = StreamKGemmTypeConfig<ck_tile::half_t>;
-        return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t>, TypeConfig>(
-            a_layout, b_layout, argc, argv);
+        if(persistent_dp)
+        {
+            return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t, true>, TypeConfig>(
+                a_layout, b_layout, argc, argv);
+        }
+        else
+        {
+            return run_gemm_example_prec_type<GemmConfig<ck_tile::half_t, false>, TypeConfig>(
+                a_layout, b_layout, argc, argv);
+        }
     }
     else if(data_type == "fp8")
     {
         using TypeConfig = StreamKGemmTypeConfig<ck_tile::fp8_t, ck_tile::fp8_t, ck_tile::half_t>;
-        return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>, TypeConfig>(
-            a_layout, b_layout, argc, argv);
+        if(persistent_dp)
+        {
+            return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t, true>, TypeConfig>(
+                a_layout, b_layout, argc, argv);
+        }
+        else
+        {
+            return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t, false>, TypeConfig>(
+                a_layout, b_layout, argc, argv);
+        }
     }
     else if(data_type == "bf8")
     {
         using TypeConfig = StreamKGemmTypeConfig<ck_tile::bf8_t, ck_tile::bf8_t, ck_tile::half_t>;
-        return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>, TypeConfig>(
-            a_layout, b_layout, argc, argv);
+        if(persistent_dp)
+        {
+            return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t, true>, TypeConfig>(
+                a_layout, b_layout, argc, argv);
+        }
+        else
+        {
+            return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t, false>, TypeConfig>(
+                a_layout, b_layout, argc, argv);
+        }
     }
     else
     {
diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt
index a6cfcde86e..92ee0a4c31 100644
--- a/example/ck_tile/CMakeLists.txt
+++ b/example/ck_tile/CMakeLists.txt
@@ -25,7 +25,6 @@ add_subdirectory(22_gemm_multi_abd)
 add_subdirectory(35_batched_transpose)
 add_subdirectory(36_pooling)
 add_subdirectory(38_block_scale_gemm)
-add_subdirectory(39_copy)
 add_subdirectory(40_streamk_gemm)
 add_subdirectory(41_batched_contraction)
 
diff --git a/experimental/builder/include/ck_tile/builder/builder_utils.hpp b/experimental/builder/include/ck_tile/builder/builder_utils.hpp
index a69471c9ed..f16d96bec6 100644
--- a/experimental/builder/include/ck_tile/builder/builder_utils.hpp
+++ b/experimental/builder/include/ck_tile/builder/builder_utils.hpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -78,66 +78,4 @@ struct UnsupportedEnumValue
 {
 };
 
-// Helper functions to convert enums to strings
-constexpr std::string_view ConvDirectionToString(ConvDirection dir)
-{
-    switch(dir)
-    {
-    case ConvDirection::FORWARD: return "Forward";
-    case ConvDirection::BACKWARD_DATA: return "Backward Data";
-    case ConvDirection::BACKWARD_WEIGHT: return "Backward Weight";
-    default: return "Unknown";
-    }
-}
-
-constexpr std::string_view DataTypeToString(DataType dt)
-{
-    switch(dt)
-    {
-    case DataType::FP16: return "FP16";
-    case DataType::FP32: return "FP32";
-    case DataType::BF16: return "BF16";
-    case DataType::FP8: return "FP8";
-    case DataType::I8: return "I8";
-    case DataType::U8: return "U8";
-    default: return "Unknown";
-    }
-}
-
-constexpr std::string_view LayoutToString(GroupConvLayout1D layout)
-{
-    switch(layout)
-    {
-    case GroupConvLayout1D::GNWC_GKXC_GNWK: return "GNWC_GKXC_GNWK";
-    case GroupConvLayout1D::NWGC_GKXC_NWGK: return "NWGC_GKXC_NWGK";
-    case GroupConvLayout1D::NGCW_GKXC_NGKW: return "NGCW_GKXC_NGKW";
-    case GroupConvLayout1D::NGCW_GKCX_NGKW: return "NGCW_GKCX_NGKW";
-    default: return "Unknown";
-    }
-}
-
-constexpr std::string_view LayoutToString(GroupConvLayout2D layout)
-{
-    switch(layout)
-    {
-    case GroupConvLayout2D::GNHWC_GKYXC_GNHWK: return "GNHWC_GKYXC_GNHWK";
-    case GroupConvLayout2D::NHWGC_GKYXC_NHWGK: return "NHWGC_GKYXC_NHWGK";
-    case GroupConvLayout2D::NGCHW_GKYXC_NGKHW: return "NGCHW_GKYXC_NGKHW";
-    case GroupConvLayout2D::NGCHW_GKCYX_NGKHW: return "NGCHW_GKCYX_NGKHW";
-    default: return "Unknown";
-    }
-}
-
-constexpr std::string_view LayoutToString(GroupConvLayout3D layout)
-{
-    switch(layout)
-    {
-    case GroupConvLayout3D::GNDHWC_GKZYXC_GNDHWK: return "GNDHWC_GKZYXC_GNDHWK";
-    case GroupConvLayout3D::NDHWGC_GKZYXC_NDHWGK: return "NDHWGC_GKZYXC_NDHWGK";
-    case GroupConvLayout3D::NGCDHW_GKZYXC_NGKDHW: return "NGCDHW_GKZYXC_NGKDHW";
-    case GroupConvLayout3D::NGCDHW_GKCZYX_NGKDHW: return "NGCDHW_GKCZYX_NGKDHW";
-    default: return "Unknown";
-    }
-}
-
 } // namespace ck_tile::builder
diff --git a/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp b/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp
index 586a119c75..6006efe4f8 100644
--- a/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp
+++ b/experimental/builder/include/ck_tile/builder/conv_algorithm_concepts.hpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -38,8 +38,8 @@ concept GridwiseXdlGemmDescriptor = requires(T t) {
 // Concept for parameter that describe block GEMM problem.
 template <typename T>
 concept BlockGemmDescriptor = requires(T t) {
-    { t.pipeline_version } -> std::convertible_to<BlockGemmPipelineVersion>;
-    { t.scheduler } -> std::convertible_to<BlockGemmPipelineScheduler>;
+    { t.pipeline_version } -> std::convertible_to<PipelineVersion>;
+    { t.scheduler } -> std::convertible_to<PipelineScheduler>;
 };
 
 // Concept for parameters that describe a gridwise WMMA GEMM problem.
@@ -50,7 +50,7 @@ concept GridwiseWmmaGemmDescriptor = requires(T t) {
     { t.n_per_wmma } -> std::convertible_to<size_t>;
     { t.m_wmma_per_wave } -> std::convertible_to<size_t>;
     { t.n_wmma_per_wave } -> std::convertible_to<size_t>;
-    { t.pipeline_version } -> std::convertible_to<GridwiseGemmPipelineVersion>;
+    { t.pipeline_version } -> std::convertible_to<PipelineVersion>;
 };
 
 // Concept for vectorized data transfer for convolution input tensors.
@@ -154,8 +154,8 @@ concept SpecifiesSourceAccessOrder = requires(T t) {
 // Concept to check if struct specifies block GEMM.
 template <typename T>
 concept SpecifiesBlockGemm = requires {
-    { T::block_gemm.pipeline_version } -> std::convertible_to<BlockGemmPipelineVersion>;
-    { T::block_gemm.scheduler } -> std::convertible_to<BlockGemmPipelineScheduler>;
+    { T::block_gemm.pipeline_version } -> std::convertible_to<PipelineVersion>;
+    { T::block_gemm.scheduler } -> std::convertible_to<PipelineScheduler>;
 };
 
 template <typename T>
@@ -180,7 +180,90 @@ concept SpecifiesNumGroupsToMerge = requires {
 
 template <typename T>
 concept SpecifiesLoopScheduler = requires {
-    { T::loop_scheduler } -> std::convertible_to<LoopScheduler>;
+    { T::loop_scheduler } -> std::convertible_to<PipelineScheduler>;
+};
+
+/******************************************** */
+/* DL-specific descriptors and requirements   */
+/******************************************** */
+
+// Concept for DL thread configuration
+template <typename T>
+concept DlThreadConfigDescriptor = requires(T t) {
+    { t.k0_per_block } -> std::convertible_to<size_t>;
+    { t.k1 } -> std::convertible_to<size_t>;
+    { t.m1_per_thread } -> std::convertible_to<size_t>;
+    { t.n1_per_thread } -> std::convertible_to<size_t>;
+    { t.k_per_thread } -> std::convertible_to<size_t>;
+};
+
+// Concept for DL thread cluster
+template <typename T>
+concept DlThreadClusterDescriptor = requires(T t) {
+    { t.m1_xs } -> std::convertible_to<std::array<size_t, 2>>;
+    { t.n1_xs } -> std::convertible_to<std::array<size_t, 2>>;
+};
+
+// Concept for DL block transfer K0_M0_M1_K1 format
+template <typename T>
+concept DlBlockTransferK0M0M1K1Descriptor = requires(T t) {
+    { t.thread_slice_lengths } -> std::convertible_to<std::array<size_t, 4>>;
+    { t.thread_cluster_lengths } -> std::convertible_to<std::array<size_t, 4>>;
+    { t.thread_cluster_arrange_order } -> std::convertible_to<std::array<size_t, 4>>;
+    { t.src_access_order } -> std::convertible_to<std::array<size_t, 4>>;
+    { t.src_vector_tensor_lengths } -> std::convertible_to<std::array<size_t, 4>>;
+    { t.src_vector_tensor_contiguous_dim_order } -> std::convertible_to<std::array<size_t, 4>>;
+    { t.dst_vector_tensor_lengths } -> std::convertible_to<std::array<size_t, 4>>;
+};
+
+// Concept for DL block transfer K0_N0_N1_K1 format
+template <typename T>
+concept DlBlockTransferK0N0N1K1Descriptor = requires(T t) {
+    { t.thread_slice_lengths } -> std::convertible_to<std::array<size_t, 4>>;
+    { t.thread_cluster_lengths } -> std::convertible_to<std::array<size_t, 4>>;
+    { t.thread_cluster_arrange_order } -> std::convertible_to<std::array<size_t, 4>>;
+    { t.src_access_order } -> std::convertible_to<std::array<size_t, 4>>;
+    { t.src_vector_tensor_lengths } -> std::convertible_to<std::array<size_t, 4>>;
+    { t.src_vector_tensor_contiguous_dim_order } -> std::convertible_to<std::array<size_t, 4>>;
+    { t.dst_vector_tensor_lengths } -> std::convertible_to<std::array<size_t, 4>>;
+};
+
+// Concept for DL C thread transfer
+template <typename T>
+concept DlCThreadTransferDescriptor = requires(T t) {
+    { t.src_dst_access_order } -> std::convertible_to<std::array<size_t, 6>>;
+    { t.src_dst_vector_dim } -> std::convertible_to<size_t>;
+    { t.dst_scalar_per_vector } -> std::convertible_to<size_t>;
+};
+
+// Concept to check if algorithm specifies DL thread config
+template <typename T>
+concept SpecifiesDlThreadConfig = requires {
+    { T::dl_thread_config } -> DlThreadConfigDescriptor;
+};
+
+// Concept to check if algorithm specifies DL thread cluster
+template <typename T>
+concept SpecifiesDlThreadCluster = requires {
+    { T::dl_thread_cluster } -> DlThreadClusterDescriptor;
+};
+
+// Concept to check if algorithm specifies DL A block transfer
+template <typename T>
+concept SpecifiesDlBlockTransferA = requires {
+    { T::dl_block_transfer_a } -> DlBlockTransferK0M0M1K1Descriptor;
+};
+
+// Concept to check if algorithm specifies DL B block transfer
+template <typename T>
+concept SpecifiesDlBlockTransferB = requires {
+    { T::dl_block_transfer_b } -> DlBlockTransferK0N0N1K1Descriptor;
+};
+
+// Concept to check if algorithm specifies DL C thread transfer
+template <typename T>
+concept SpecifiesDlCThreadTransfer = requires {
+    { T::dl_c_thread_transfer } -> DlCThreadTransferDescriptor;
 };
 
 } // namespace ck_tile::builder
diff --git a/experimental/builder/include/ck_tile/builder/conv_algorithm_limits.hpp b/experimental/builder/include/ck_tile/builder/conv_algorithm_limits.hpp
index 68d5ec5a83..b24d0f47e9 100644
--- a/experimental/builder/include/ck_tile/builder/conv_algorithm_limits.hpp
+++ b/experimental/builder/include/ck_tile/builder/conv_algorithm_limits.hpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/experimental/builder/include/ck_tile/builder/conv_builder.hpp b/experimental/builder/include/ck_tile/builder/conv_builder.hpp
index d74948709b..bfb7386ea1 100644
--- a/experimental/builder/include/ck_tile/builder/conv_builder.hpp
+++ b/experimental/builder/include/ck_tile/builder/conv_builder.hpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/experimental/builder/include/ck_tile/builder/conv_factory.hpp b/experimental/builder/include/ck_tile/builder/conv_factory.hpp
index 8ea3e18d65..e40199987d 100644
--- a/experimental/builder/include/ck_tile/builder/conv_factory.hpp
+++ b/experimental/builder/include/ck_tile/builder/conv_factory.hpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 // A factory for instantiating CK convolution kernels.
 //
@@ -36,9 +36,21 @@
 
 #pragma once
 
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp"
+// WORKAROUND: Macro name collision in upstream CK device operation headers.
+// device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp (line 41) and
+// device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp (line 51) both define the
+// GridwiseGemmTemplateParameters macro without a matching #undef, causing redefinition errors.
+// Use #pragma push_macro/pop_macro to isolate the Large_Tensor header's macro definition.
+#pragma push_macro("GridwiseGemmTemplateParameters")
+#ifdef GridwiseGemmTemplateParameters
+#undef GridwiseGemmTemplateParameters
+#endif
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp"
+#pragma pop_macro("GridwiseGemmTemplateParameters")
 #include "ck_tile/builder/conv_signature_concepts.hpp"
 #include "ck_tile/builder/conv_algorithm_concepts.hpp"
 #include "ck_tile/builder/conv_algorithm_limits.hpp"
@@ -297,42 +309,42 @@ constexpr BlockGemmSpec SetBlockGemm()
     ck::BlockGemmPipelineScheduler scheduler;
     ck::BlockGemmPipelineVersion version;
 
-    if constexpr(BG.scheduler == BlockGemmPipelineScheduler::INTRAWAVE)
+    if constexpr(BG.scheduler == PipelineScheduler::INTRAWAVE)
     {
         scheduler = ck::BlockGemmPipelineScheduler::Intrawave;
     }
-    else if constexpr(BG.scheduler == BlockGemmPipelineScheduler::INTERWAVE)
+    else if constexpr(BG.scheduler == PipelineScheduler::INTERWAVE)
     {
         scheduler = ck::BlockGemmPipelineScheduler::Interwave;
     }
     else
     {
-        static_assert(false, "Unknown BlockGemmPipelineScheduler");
+        static_assert(false, "Unknown PipelineScheduler");
     }
 
-    if constexpr(BG.pipeline_version == BlockGemmPipelineVersion::V1)
+    if constexpr(BG.pipeline_version == PipelineVersion::V1)
     {
         version = ck::BlockGemmPipelineVersion::v1;
     }
-    else if constexpr(BG.pipeline_version == BlockGemmPipelineVersion::V2)
+    else if constexpr(BG.pipeline_version == PipelineVersion::V2)
     {
         version = ck::BlockGemmPipelineVersion::v2;
     }
-    else if constexpr(BG.pipeline_version == BlockGemmPipelineVersion::V3)
+    else if constexpr(BG.pipeline_version == PipelineVersion::V3)
     {
         version = ck::BlockGemmPipelineVersion::v3;
     }
-    else if constexpr(BG.pipeline_version == BlockGemmPipelineVersion::V4)
+    else if constexpr(BG.pipeline_version == PipelineVersion::V4)
     {
         version = ck::BlockGemmPipelineVersion::v4;
     }
-    else if constexpr(BG.pipeline_version == BlockGemmPipelineVersion::V5)
+    else if constexpr(BG.pipeline_version == PipelineVersion::V5)
     {
         version = ck::BlockGemmPipelineVersion::v5;
     }
     else
     {
-        static_assert(false, "Unknown BlockGemmPipelineVersion");
+        static_assert(false, "Unknown PipelineVersion");
     }
 
     return BlockGemmSpec{.pipeline_version = version, .scheduler = scheduler};
@@ -442,17 +454,17 @@ consteval ck::LoopScheduler SetLoopScheduler()
 {
     constexpr auto loop_scheduler = ALGORITHM.loop_scheduler;
 
-    if constexpr(loop_scheduler == LoopScheduler::DEFAULT)
+    if constexpr(loop_scheduler == PipelineScheduler::DEFAULT)
     {
         return ck::LoopScheduler::Default;
     }
-    else if constexpr(loop_scheduler == LoopScheduler::INTERWAVE)
+    else if constexpr(loop_scheduler == PipelineScheduler::INTERWAVE)
     {
         return ck::LoopScheduler::Interwave;
     }
     else
     {
-        static_assert(false, "Unknown LoopScheduler");
+        static_assert(false, "Unknown PipelineScheduler");
     }
 }
 
@@ -460,29 +472,29 @@ template <ConvAlgorithmDescriptor auto ALGORITHM>
 consteval ck::PipelineVersion SetGridwiseGemmPipelineVersion()
 {
     constexpr auto pipeline_version = ALGORITHM.gridwise_gemm.pipeline_version;
-    if constexpr(pipeline_version == GridwiseGemmPipelineVersion::V1)
+    if constexpr(pipeline_version == PipelineVersion::V1)
     {
         return ck::PipelineVersion::v1;
     }
-    else if constexpr(pipeline_version == GridwiseGemmPipelineVersion::V2)
+    else if constexpr(pipeline_version == PipelineVersion::V2)
     {
         return ck::PipelineVersion::v2;
     }
-    else if constexpr(pipeline_version == GridwiseGemmPipelineVersion::V3)
+    else if constexpr(pipeline_version == PipelineVersion::V3)
     {
         static_assert(false, "V3 is used only for stream-K.");
     }
-    else if constexpr(pipeline_version == GridwiseGemmPipelineVersion::V4)
+    else if constexpr(pipeline_version == PipelineVersion::V4)
     {
         return ck::PipelineVersion::v4;
     }
-    else if constexpr(pipeline_version == GridwiseGemmPipelineVersion::WEIGHT_ONLY)
+    else if constexpr(pipeline_version == PipelineVersion::WEIGHT_ONLY)
     {
         return ck::PipelineVersion::weight_only;
     }
     else
     {
-        static_assert(false, "Unknown GridwiseGemmPipelineVersion");
+        static_assert(false, "Unknown PipelineVersion");
     }
 }
 
@@ -566,29 +578,29 @@ consteval ck::BlockGemmPipelineVersion SetBlockGemmPipelineVersion()
 {
     constexpr auto version = ALGORITHM.pipeline_version;
 
-    if constexpr(version == BlockGemmPipelineVersion::V1)
+    if constexpr(version == PipelineVersion::V1)
     {
         return ck::BlockGemmPipelineVersion::v1;
     }
-    else if constexpr(version == BlockGemmPipelineVersion::V2)
+    else if constexpr(version == PipelineVersion::V2)
     {
         return ck::BlockGemmPipelineVersion::v2;
     }
-    else if constexpr(version == BlockGemmPipelineVersion::V3)
+    else if constexpr(version == PipelineVersion::V3)
     {
         return ck::BlockGemmPipelineVersion::v3;
     }
-    else if constexpr(version == BlockGemmPipelineVersion::V4)
+    else if constexpr(version == PipelineVersion::V4)
     {
         return ck::BlockGemmPipelineVersion::v4;
     }
-    else if constexpr(version == BlockGemmPipelineVersion::V5)
+    else if constexpr(version == PipelineVersion::V5)
     {
         return ck::BlockGemmPipelineVersion::v5;
     }
     else
     {
-        static_assert(false, "Unknown BlockGemmPipelineVersion");
+        static_assert(false, "Unknown PipelineVersion");
     }
 }
 
@@ -990,4 +1002,263 @@ struct ConvFactory<SIGNATURE, ALGORITHM, VERSION>
         GRIDWISE_GEMM_PIPELINE_VERSION>;
 };
 
+// Factory specialization for DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK instance
+// of a grouped forward convolution kernel using Direct Load (DL) approach.
+template <ConvSignatureDescriptor auto SIGNATURE,
+          ConvAlgorithmDescriptor auto ALGORITHM,
+          StringLiteral VERSION>
+    requires ConvDirectionIsForward<SIGNATURE> &&
+             ConvDeviceOpIs_DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<SIGNATURE>
+struct ConvFactory<SIGNATURE, ALGORITHM, VERSION>
+{
+    static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
+    using Layouts       = decltype(factory_internal::GetTensorLayout<SIGNATURE.layout,
+                                                                     SPATIAL_DIM,
+                                                                     ConvDirection::FORWARD>());
+    using Types         = factory_internal::ConvTensorTypes<SIGNATURE.data_type>;
+    using Ops           = factory_internal::ElementwiseOps<SIGNATURE.elementwise_operation>;
+    using AlgorithmType = decltype(ALGORITHM);
+
+    static_assert(SpecifiesThreadBlock<AlgorithmType>,
+                  "The convolution algorithm descriptor must specify thread block info.");
+    static_assert(SpecifiesFwdConcSpecialization<AlgorithmType>,
+                  "The convolution algorithm descriptor must specify forward convolution "
+                  "specialization.");
+    static_assert(SpecifiesGemmSpecialization<AlgorithmType>,
+                  "The convolution algorithm descriptor must specify gemm specialization.");
+    static_assert(SpecifiesDlThreadConfig<AlgorithmType>,
+                  "DL algorithm must specify thread config.");
+    static_assert(SpecifiesDlThreadCluster<AlgorithmType>,
+                  "DL algorithm must specify thread cluster.");
+    static_assert(SpecifiesDlBlockTransferA<AlgorithmType>,
+                  "DL algorithm must specify A block transfer.");
+    static_assert(SpecifiesDlBlockTransferB<AlgorithmType>,
+                  "DL algorithm must specify B block transfer.");
+    static_assert(SpecifiesDlCThreadTransfer<AlgorithmType>,
+                  "DL algorithm must specify C thread transfer.");
+
+    static constexpr auto FWD_CONV_SPECIALIZATION =
+        factory_internal::SetFwdConvSpecialization<ALGORITHM>();
+    static constexpr auto GEMM_SPECIALIZATION =
+        factory_internal::SetGemmSpecialization<ALGORITHM>();
+
+    static constexpr auto BLOCK = factory_internal::SetThreadBlockInfo<ALGORITHM>();
+
+    // DL-specific parameters from algorithm descriptor
+    static constexpr auto DL_THREAD_CFG      = ALGORITHM.dl_thread_config;
+    static constexpr ck::index_t K0PerBlock  = DL_THREAD_CFG.k0_per_block;
+    static constexpr ck::index_t K1          = DL_THREAD_CFG.k1;
+    static constexpr ck::index_t M1PerThread = DL_THREAD_CFG.m1_per_thread;
+    static constexpr ck::index_t N1PerThread = DL_THREAD_CFG.n1_per_thread;
+    static constexpr ck::index_t KPerThread  = DL_THREAD_CFG.k_per_thread;
+
+    // Thread cluster from descriptor
+    static constexpr auto DL_CLUSTER = ALGORITHM.dl_thread_cluster;
+    using M1N1ThreadClusterM1Xs      = to_sequence_v<DL_CLUSTER.m1_xs>;
+    using M1N1ThreadClusterN1Xs      = to_sequence_v<DL_CLUSTER.n1_xs>;
+
+    // A Block Transfer from descriptor - K0_M0_M1_K1 tensor format
+    static constexpr auto DL_A_TRANSFER = ALGORITHM.dl_block_transfer_a;
+    using ABlockTransferThreadSliceLengths_K0_M0_M1_K1 =
+        to_sequence_v<DL_A_TRANSFER.thread_slice_lengths>;
+    using ABlockTransferThreadClusterLengths_K0_M0_M1_K1 =
+        to_sequence_v<DL_A_TRANSFER.thread_cluster_lengths>;
+    using ABlockTransferThreadClusterArrangeOrder =
+        to_sequence_v<DL_A_TRANSFER.thread_cluster_arrange_order>;
+    using ABlockTransferSrcAccessOrder = to_sequence_v<DL_A_TRANSFER.src_access_order>;
+    using ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1 =
+        to_sequence_v<DL_A_TRANSFER.src_vector_tensor_lengths>;
+    using ABlockTransferSrcVectorTensorContiguousDimOrder =
+        to_sequence_v<DL_A_TRANSFER.src_vector_tensor_contiguous_dim_order>;
+    using ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1 =
+        to_sequence_v<DL_A_TRANSFER.dst_vector_tensor_lengths>;
+
+    // B Block Transfer from descriptor - K0_N0_N1_K1 tensor format
+    static constexpr auto DL_B_TRANSFER = ALGORITHM.dl_block_transfer_b;
+    using BBlockTransferThreadSliceLengths_K0_N0_N1_K1 =
+        to_sequence_v<DL_B_TRANSFER.thread_slice_lengths>;
+    using BBlockTransferThreadClusterLengths_K0_N0_N1_K1 =
+        to_sequence_v<DL_B_TRANSFER.thread_cluster_lengths>;
+    using BBlockTransferThreadClusterArrangeOrder =
+        to_sequence_v<DL_B_TRANSFER.thread_cluster_arrange_order>;
+    using BBlockTransferSrcAccessOrder = to_sequence_v<DL_B_TRANSFER.src_access_order>;
+    using BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1 =
+        to_sequence_v<DL_B_TRANSFER.src_vector_tensor_lengths>;
+    using BBlockTransferSrcVectorTensorContiguousDimOrder =
+        to_sequence_v<DL_B_TRANSFER.src_vector_tensor_contiguous_dim_order>;
+    using BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1 =
+        to_sequence_v<DL_B_TRANSFER.dst_vector_tensor_lengths>;
+
+    // C Thread Transfer from descriptor
+    static constexpr auto DL_C_TRANSFER    = ALGORITHM.dl_c_thread_transfer;
+    using CThreadTransferSrcDstAccessOrder = to_sequence_v<DL_C_TRANSFER.src_dst_access_order>;
+    static constexpr ck::index_t CThreadTransferSrcDstVectorDim = DL_C_TRANSFER.src_dst_vector_dim;
+    static constexpr ck::index_t CThreadTransferDstScalarPerVector =
+        DL_C_TRANSFER.dst_scalar_per_vector;
+
+    // The DL forward convolution kernel class instance
+    using Instance = ck::tensor_operation::device::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<
+        SPATIAL_DIM,
+        typename Types::ADataType,
+        typename Types::BDataType,
+        typename Types::DsDataTypes,
+        typename Types::EDataType,
+        typename Types::AccDataType,
+        typename Layouts::ALayout,
+        typename Layouts::BLayout,
+        typename Layouts::DsLayout,
+        typename Layouts::ELayout,
+        typename Ops::AElementwiseOp,
+        typename Ops::BElementwiseOp,
+        typename Ops::CDEElementwiseOp,
+        FWD_CONV_SPECIALIZATION,
+        GEMM_SPECIALIZATION,
+        BLOCK.block_size,
+        BLOCK.per_block.m,
+        BLOCK.per_block.n,
+        K0PerBlock,
+        K1,
+        M1PerThread,
+        N1PerThread,
+        KPerThread,
+        M1N1ThreadClusterM1Xs,
+        M1N1ThreadClusterN1Xs,
+        ABlockTransferThreadSliceLengths_K0_M0_M1_K1,
+        ABlockTransferThreadClusterLengths_K0_M0_M1_K1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorTensorLengths_K0_M0_M1_K1,
+        ABlockTransferSrcVectorTensorContiguousDimOrder,
+        ABlockTransferDstVectorTensorLengths_K0_M0_M1_K1,
+        BBlockTransferThreadSliceLengths_K0_N0_N1_K1,
+        BBlockTransferThreadClusterLengths_K0_N0_N1_K1,
+        BBlockTransferThreadClusterArrangeOrder,
+        BBlockTransferSrcAccessOrder,
+        BBlockTransferSrcVectorTensorLengths_K0_N0_N1_K1,
+        BBlockTransferSrcVectorTensorContiguousDimOrder,
+        BBlockTransferDstVectorTensorLengths_K0_N0_N1_K1,
+        CThreadTransferSrcDstAccessOrder,
+        CThreadTransferSrcDstVectorDim,
+        CThreadTransferDstScalarPerVector>;
+};
+
+// Factory specialization for DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor instance
+// of a grouped forward convolution kernel with large tensor support (N-splitting).
+template <ConvSignatureDescriptor auto SIGNATURE,
+          ConvAlgorithmDescriptor auto ALGORITHM,
+          StringLiteral VERSION>
+    requires ConvDirectionIsForward<SIGNATURE> &&
+             ConvDeviceOpIs_DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<SIGNATURE>
+struct ConvFactory<SIGNATURE, ALGORITHM, VERSION>
+{
+    static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
+    using Layouts       = decltype(factory_internal::GetTensorLayout<SIGNATURE.layout,
+                                                                     SPATIAL_DIM,
+                                                                     ConvDirection::FORWARD>());
+    using Types         = factory_internal::ConvTensorTypes<SIGNATURE.data_type>;
+    using Ops           = factory_internal::ElementwiseOps<SIGNATURE.elementwise_operation>;
+    using AlgorithmType = decltype(ALGORITHM);
+
+    static_assert(SpecifiesThreadBlock<AlgorithmType>,
+                  "The convolution algorithm descriptor must specify thread block info.");
+    static_assert(SpecifiesGridwiseXdlGemm<AlgorithmType>,
+                  "The convolution algorithm descriptor must specify gridwise GEMM info.");
+    static_assert(SpecifiesBlockTransfer<AlgorithmType>,
+                  "The convolution algorithm descriptor must specify block transfer info.");
+    static_assert(SpecifiesLdsTransfer<AlgorithmType>,
+                  "The convolution algorithm descriptor must specify LDS transfer info.");
+    static_assert(
+        SpecifiesThreadClusterAccessOrder<AlgorithmType>,
+        "The convolution algorithm descriptor must specify thread cluster access order info.");
+    static_assert(SpecifiesSourceAccessOrder<AlgorithmType>,
+                  "The convolution algorithm descriptor must specify source access order info.");
+    static_assert(SpecifiesFwdConcSpecialization<AlgorithmType>,
+                  "The convolution algorithm descriptor must specify forward convolution "
+                  "specialization.");
+    static_assert(SpecifiesGemmSpecialization<AlgorithmType>,
+                  "The convolution algorithm descriptor must specify gemm specialization.");
+    static_assert(SpecifiesNumPrefetchStages<AlgorithmType>,
+                  "The convolution algorithm descriptor must specify number of prefetch stages.");
+    static_assert(SpecifiesLoopScheduler<AlgorithmType>,
+                  "The convolution algorithm descriptor must specify loop scheduler.");
+
+    static constexpr auto FWD_CONV_SPECIALIZATION =
+        factory_internal::SetFwdConvSpecialization<ALGORITHM>();
+    static constexpr auto GEMM_SPECIALIZATION =
+        factory_internal::SetGemmSpecialization<ALGORITHM>();
+    static constexpr factory_internal::ConvSpec SPECIALIZATION{.conv_spec = FWD_CONV_SPECIALIZATION,
+                                                               .gemm_spec = GEMM_SPECIALIZATION};
+
+    static constexpr auto LOOP_SCHEDULER = factory_internal::SetLoopScheduler<ALGORITHM>();
+    static constexpr auto BLOCK          = factory_internal::SetThreadBlockInfo<ALGORITHM>();
+    static constexpr auto GRIDWISE_GEMM  = ALGORITHM.gridwise_gemm;
+    static constexpr auto A_BLOCK_TRANSFER =
+        factory_internal::SetFwdConvABlockTransfer<ALGORITHM>();
+    static constexpr auto B_BLOCK_TRANSFER =
+        factory_internal::SetFwdConvBBlockTransfer<ALGORITHM>();
+    static constexpr auto C_BLOCK_TRANSFER =
+        factory_internal::SetCBlockTransfer<SIGNATURE, ALGORITHM>();
+
+    // Check limits for the algorithm parameters.
+    static_assert(InputVectorTransferLimits<A_BLOCK_TRANSFER>);
+    static_assert(InputVectorTransferLimits<B_BLOCK_TRANSFER>);
+    static_assert(OutputVectorTransferLimits<C_BLOCK_TRANSFER>);
+    static_assert(AccessOrderLimits<A_BLOCK_TRANSFER.thread_cluster_order>);
+    static_assert(AccessOrderLimits<B_BLOCK_TRANSFER.thread_cluster_order>);
+    static_assert(AccessOrderLimits<A_BLOCK_TRANSFER.src_access_order>);
+    static_assert(AccessOrderLimits<B_BLOCK_TRANSFER.src_access_order>);
+
+    // The forward convolution kernel class instance with large tensor support.
+    using Instance =
+        ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<
+            SPATIAL_DIM,
+            typename Layouts::ALayout,
+            typename Layouts::BLayout,
+            typename Layouts::DsLayout,
+            typename Layouts::ELayout,
+            typename Types::ADataType,
+            typename Types::BDataType,
+            typename Types::AccDataType,
+            typename Types::CShuffleDataType,
+            typename Types::DsDataTypes,
+            typename Types::EDataType,
+            typename Ops::AElementwiseOp,
+            typename Ops::BElementwiseOp,
+            typename Ops::CDEElementwiseOp,
+            SPECIALIZATION.conv_spec,
+            SPECIALIZATION.gemm_spec,
+            ALGORITHM.num_gemm_k_prefetch_stages,
+            BLOCK.block_size,
+            BLOCK.per_block.m,
+            BLOCK.per_block.n,
+            BLOCK.per_block.k,
+            GRIDWISE_GEMM.ak1,
+            GRIDWISE_GEMM.bk1,
+            GRIDWISE_GEMM.m_per_xdl,
+            GRIDWISE_GEMM.n_per_xdl,
+            GRIDWISE_GEMM.m_xdl_per_wave,
+            GRIDWISE_GEMM.n_xdl_per_wave,
+            to_sequence_v<A_BLOCK_TRANSFER.thread_cluster_dims>,
+            to_sequence_v<A_BLOCK_TRANSFER.thread_cluster_order>,
+            to_sequence_v<A_BLOCK_TRANSFER.src_access_order>,
+            A_BLOCK_TRANSFER.src_vector_dim,
+            A_BLOCK_TRANSFER.src_scalar_per_vector,
+            A_BLOCK_TRANSFER.lds_dst_scalar_per_vector,
+            A_BLOCK_TRANSFER.lds_padding,
+            to_sequence_v<B_BLOCK_TRANSFER.thread_cluster_dims>,
+            to_sequence_v<B_BLOCK_TRANSFER.thread_cluster_order>,
+            to_sequence_v<B_BLOCK_TRANSFER.src_access_order>,
+            B_BLOCK_TRANSFER.src_vector_dim,
+            B_BLOCK_TRANSFER.src_scalar_per_vector,
+            B_BLOCK_TRANSFER.lds_dst_scalar_per_vector,
+            B_BLOCK_TRANSFER.lds_padding,
+            C_BLOCK_TRANSFER.m_per_wave_per_shuffle,
+            C_BLOCK_TRANSFER.n_per_wave_per_shuffle,
+            to_sequence_v<C_BLOCK_TRANSFER.thread_cluster_dims>,
+            C_BLOCK_TRANSFER.scalar_per_vector,
+            typename Types::AComputeType,
+            typename Types::BComputeType,
+            LOOP_SCHEDULER>;
+};
+
 } // namespace ck_tile::builder
diff --git a/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp b/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp
index 370e7b6521..742dfbb89c 100644
--- a/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp
+++ b/experimental/builder/include/ck_tile/builder/conv_signature_concepts.hpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 // This file defines the compile-time "signature" for grouped convolution operations.
 // A signature is a collection of properties that fully describe a convolution kernel's
diff --git a/experimental/builder/include/ck_tile/builder/conv_signature_predicates.hpp b/experimental/builder/include/ck_tile/builder/conv_signature_predicates.hpp
index f947c7e329..3869c7b538 100644
--- a/experimental/builder/include/ck_tile/builder/conv_signature_predicates.hpp
+++ b/experimental/builder/include/ck_tile/builder/conv_signature_predicates.hpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -33,30 +33,35 @@ concept ConvDirectionIsBackwardWeight = (Sig.direction == ConvDirection::BACKWAR
 // Predicate for DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3 operation.
 template <auto Sig>
 concept ConvDeviceOpIs_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3 =
+    ConvDirectionIsForward<Sig> &&
     (Sig.device_operation._fwd ==
      FwdGroupConvDeviceOperation::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3);
 
 // Predicate for DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK operation.
 template <auto Sig>
 concept ConvDeviceOpIs_DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK =
+    ConvDirectionIsForward<Sig> &&
     (Sig.device_operation._fwd ==
      FwdGroupConvDeviceOperation::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK);
 
 // Predicate for DeviceGroupedConvFwdMultipleD_Wmma_CShuffle operation.
 template <auto Sig>
 concept ConvDeviceOpIs_DeviceGroupedConvFwdMultipleD_Wmma_CShuffle =
+    ConvDirectionIsForward<Sig> &&
     (Sig.device_operation._fwd ==
      FwdGroupConvDeviceOperation::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle);
 
 // Predicate for DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle operation.
 template <auto Sig>
 concept ConvDeviceOpIs_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle =
+    ConvDirectionIsForward<Sig> &&
     (Sig.device_operation._fwd ==
      FwdGroupConvDeviceOperation::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle);
 
 // Predicate for DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor operation.
 template <auto Sig>
 concept ConvDeviceOpIs_DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor =
+    ConvDirectionIsForward<Sig> &&
     (Sig.device_operation._fwd ==
      FwdGroupConvDeviceOperation::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor);
 
@@ -76,48 +81,56 @@ concept ConvDeviceOpIsForward =
 // Predicate for DeviceGroupedConvBwdWeight operation.
 template <auto Sig>
 concept ConvDeviceOpIs_DeviceGroupedConvBwdWeight =
+    ConvDirectionIsBackwardWeight<Sig> &&
     (Sig.device_operation._bwd_weight ==
      BwdWeightGroupConvDeviceOperation::DeviceGroupedConvBwdWeight);
 
 // Predicate for DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle operation.
 template <auto Sig>
 concept ConvDeviceOpIs_DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle =
+    ConvDirectionIsBackwardWeight<Sig> &&
     (Sig.device_operation._bwd_weight ==
      BwdWeightGroupConvDeviceOperation::DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle);
 
 // Predicate for DeviceGroupedConvBwdWeight_Xdl_CShuffle operation.
 template <auto Sig>
 concept ConvDeviceOpIs_DeviceGroupedConvBwdWeight_Xdl_CShuffle =
+    ConvDirectionIsBackwardWeight<Sig> &&
     (Sig.device_operation._bwd_weight ==
      BwdWeightGroupConvDeviceOperation::DeviceGroupedConvBwdWeight_Xdl_CShuffle);
 
 // Predicate for DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle operation.
 template <auto Sig>
 concept ConvDeviceOpIs_DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle =
+    ConvDirectionIsBackwardWeight<Sig> &&
     (Sig.device_operation._bwd_weight ==
      BwdWeightGroupConvDeviceOperation::DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle);
 
 // Predicate for DeviceGroupedConvBwdWeight_Wmma_CShuffle operation.
 template <auto Sig>
 concept ConvDeviceOpIs_DeviceGroupedConvBwdWeight_Wmma_CShuffle =
+    ConvDirectionIsBackwardWeight<Sig> &&
     (Sig.device_operation._bwd_weight ==
      BwdWeightGroupConvDeviceOperation::DeviceGroupedConvBwdWeight_Wmma_CShuffle);
 
 // Predicate for DeviceGroupedConvBwdWeight_Xdl_CShuffleV3 operation.
 template <auto Sig>
 concept ConvDeviceOpIs_DeviceGroupedConvBwdWeight_Xdl_CShuffleV3 =
+    ConvDirectionIsBackwardWeight<Sig> &&
     (Sig.device_operation._bwd_weight ==
      BwdWeightGroupConvDeviceOperation::DeviceGroupedConvBwdWeight_Xdl_CShuffleV3);
 
 // Predicate for DeviceGroupedConvBwdWeightMultipleD operation.
 template <auto Sig>
 concept ConvDeviceOpIs_DeviceGroupedConvBwdWeightMultipleD =
+    ConvDirectionIsBackwardWeight<Sig> &&
     (Sig.device_operation._bwd_weight ==
      BwdWeightGroupConvDeviceOperation::DeviceGroupedConvBwdWeightMultipleD);
 
 // Predicate for DeviceGroupedConvBwdWeight_Dl operation.
 template <auto Sig>
 concept ConvDeviceOpIs_DeviceGroupedConvBwdWeight_Dl =
+    ConvDirectionIsBackwardWeight<Sig> &&
     (Sig.device_operation._bwd_weight ==
      BwdWeightGroupConvDeviceOperation::DeviceGroupedConvBwdWeight_Dl);
 
@@ -140,18 +153,21 @@ concept ConvDeviceOpIsBackwardWeight =
 // Predicate for DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 operation.
 template <auto Sig>
 concept ConvDeviceOpIs_DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1 =
+    ConvDirectionIsBackwardData<Sig> &&
     (Sig.device_operation._bwd_data ==
      BwdDataGroupConvDeviceOperation::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1);
 
 // Predicate for DeviceGroupedConvBwdDataMultipleD operation.
 template <auto Sig>
 concept ConvDeviceOpIs_DeviceGroupedConvBwdDataMultipleD =
+    ConvDirectionIsBackwardData<Sig> &&
     (Sig.device_operation._bwd_data ==
      BwdDataGroupConvDeviceOperation::DeviceGroupedConvBwdDataMultipleD);
 
 // Predicate for DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle operation.
 template <auto Sig>
 concept ConvDeviceOpIs_DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle =
+    ConvDirectionIsBackwardData<Sig> &&
     (Sig.device_operation._bwd_data ==
      BwdDataGroupConvDeviceOperation::DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle);
 
diff --git a/experimental/builder/include/ck_tile/builder/device_op_types.hpp b/experimental/builder/include/ck_tile/builder/device_op_types.hpp
new file mode 100644
index 0000000000..0e779fdf4e
--- /dev/null
+++ b/experimental/builder/include/ck_tile/builder/device_op_types.hpp
@@ -0,0 +1,22 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+namespace ck_tile::builder {
+
+// Enumeration for CK Device Operation types.
+// This allows the builder to select which device operation template to instantiate
+// based on the user's requirements.
+enum class DeviceOpType
+{
+    // Forward Convolution - Non-grouped
+    CONV_FWD, // Maps to: DeviceConvFwd (TODO: No implementation with tuning params exists yet)
+
+    // Forward Convolution - Grouped
+    GROUPED_CONV_FWD_MULTIPLE_ABD, // Maps to: DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
+    GROUPED_CONV_FWD_MULTIPLE_ABD_XDL_CSHUFFLE_V3, // Maps to:
+                                                   // DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
+};
+
+} // namespace ck_tile::builder
diff --git a/experimental/builder/include/ck_tile/builder/reflect/conv_description.hpp b/experimental/builder/include/ck_tile/builder/reflect/conv_description.hpp
new file mode 100644
index 0000000000..0b58f5a3b7
--- /dev/null
+++ b/experimental/builder/include/ck_tile/builder/reflect/conv_description.hpp
@@ -0,0 +1,268 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <concepts>
+#include <string_view>
+#include <sstream>
+#include <type_traits>
+#include <variant>
+
+#include <ck_tile/builder/conv_signature_concepts.hpp>
+#include <ck_tile/builder/reflect/conv_traits.hpp>
+#include <ck_tile/builder/reflect/tree_formatter.hpp>
+
+/// @file conv_description.hpp
+/// @brief Provides human-readable descriptions of ConvBuilder configurations
+
+namespace ck_tile::reflect::conv {
+
+struct ConvSignatureInfo
+{
+    int spatial_dim;
+    builder::ConvDirection direction;
+    std::variant<builder::GroupConvLayout1D, builder::GroupConvLayout2D, builder::GroupConvLayout3D>
+        layout;
+    builder::DataType data_type;
+    builder::ElementwiseOperation input_element_op;
+    builder::ElementwiseOperation weight_element_op;
+    builder::ElementwiseOperation output_element_op;
+};
+
+// Algorithm information - groups all algorithm-related configuration
+struct GemmAlgorithmInfo
+{
+    int thread_block_size;
+    DataTileInfo tile_dims;
+    WarpGemmParams warp_gemm;
+    InputTileTransferInfo a_tile_transfer;
+    InputTileTransferInfo b_tile_transfer;
+    OutputTileTransferInfo c_tile_transfer;
+    builder::PipelineVersion pipeline_version;
+    builder::PipelineScheduler pipeline_scheduler;
+    std::variant<builder::ConvFwdSpecialization,
+                 builder::ConvBwdDataSpecialization,
+                 builder::ConvBwdWeightSpecialization>
+        conv_specialization;
+    builder::GemmPadding padding;
+};
+
+// Provides human-readable descriptions of ConvBuilder configurations.
+struct ConvDescription
+{
+    ConvSignatureInfo signature;
+    GemmAlgorithmInfo algorithm;
+
+    // Brief one-line summary
+    std::string brief() const
+    {
+        std::ostringstream oss;
+        oss << signature.spatial_dim << "D " << signature.direction << " convolution";
+        return oss.str();
+    }
+
+    // Detailed hierarchical description rendered as a tree: signature, then algorithm details.
+    std::string detailed() const
+    {
+        TreeFormatter f;
+        f.writeLine(0, signature.spatial_dim, "D ", signature.direction, " Convolution Kernel");
+        f.writeLine(1, "Signature");
+        f.writeLine(2, "Tensor Type: ", signature.data_type);
+        f.writeLine(2, "Memory Layout: ", signature.layout);
+        f.writeLine(2, "Input elementwise operation: ", signature.input_element_op);
+        f.writeLine(2, "Weights elementwise operation: ", signature.weight_element_op);
+        f.writeLast(2, "Output elementwise operation: ", signature.output_element_op);
+
+        f.writeLine(1, "Algorithm");
+        // Compute Block section
+        f.writeLine(2, "Thread block size: ", algorithm.thread_block_size);
+        f.writeLine(2,
+                    "Data tile size: ",
+                    algorithm.tile_dims.m,
+                    "×",
+                    algorithm.tile_dims.n,
+                    "×",
+                    algorithm.tile_dims.k);
+        f.writeLine(2, "Gemm padding: ", algorithm.padding);
+        f.writeLine(2, "Convolution specialization: ", algorithm.conv_specialization);
+        // Pipeline section
+        f.writeLine(2, "Pipeline version: ", algorithm.pipeline_version);
+        f.writeLine(2, "Pipeline scheduler: ", algorithm.pipeline_scheduler);
+        f.writeLine(2, "Warp Gemm parameters: ");
+        f.writeLine(
+            3, "subtile size: ", algorithm.warp_gemm.gemm_m, "×", algorithm.warp_gemm.gemm_n);
+        f.writeLast(3,
+                    "Number of warp gemm iterations: ",
+                    algorithm.warp_gemm.m_iter,
+                    "×",
+                    algorithm.warp_gemm.n_iter);
+
+        // Memory Access section
+        f.writeLine(2, "Memory access:");
+
+        f.writeLine(3, "A Tile transfer: ");
+        // Printed as k0×m×k1; no separator is emitted after the last dimension.
+        f.writeLine(4,
+                    "Tile dimensions: ",
+                    algorithm.a_tile_transfer.tile_dimensions.k0,
+                    "×",
+                    algorithm.a_tile_transfer.tile_dimensions.m_or_n,
+                    "×",
+                    algorithm.a_tile_transfer.tile_dimensions.k1);
+        f.writeLine(
+            4, "The innermost K subdimension size: ", algorithm.a_tile_transfer.transfer_params.k1);
+        f.writeLine(4,
+                    "Spatial thread distribution over the data tile: ",
+                    algorithm.a_tile_transfer.transfer_params.thread_cluster_dims[0],
+                    "×",
+                    algorithm.a_tile_transfer.transfer_params.thread_cluster_dims[1],
+                    "×",
+                    algorithm.a_tile_transfer.transfer_params.thread_cluster_dims[2]);
+        f.writeLine(4,
+                    "The order of accessing data tile axes: ",
+                    algorithm.a_tile_transfer.transfer_params.src_access_order[0],
+                    "×",
+                    algorithm.a_tile_transfer.transfer_params.src_access_order[1],
+                    "×",
+                    algorithm.a_tile_transfer.transfer_params.src_access_order[2]);
+        f.writeLine(4,
+                    "Vectorized memory access axis index (with contiguous memory): ",
+                    algorithm.a_tile_transfer.transfer_params.src_vector_dim);
+        f.writeLine(4,
+                    "Vector access (GMEM read) instruction size: ",
+                    algorithm.a_tile_transfer.transfer_params.src_scalar_per_vector);
+        f.writeLine(4,
+                    "Vector access (LDS write) instruction size: ",
+                    algorithm.a_tile_transfer.transfer_params.dst_scalar_per_vector_k1);
+        f.writeLast(4,
+                    "LDS data layout padding (to prevent bank conflicts): ",
+                    algorithm.a_tile_transfer.transfer_params.lds_padding);
+
+        f.writeLine(3, "B Tile transfer: ");
+        // Printed as k0×n×k1; no separator is emitted after the last dimension.
+        f.writeLine(4,
+                    "Tile dimensions: ",
+                    algorithm.b_tile_transfer.tile_dimensions.k0,
+                    "×",
+                    algorithm.b_tile_transfer.tile_dimensions.m_or_n,
+                    "×",
+                    algorithm.b_tile_transfer.tile_dimensions.k1);
+        f.writeLine(
+            4, "The innermost K subdimension size: ", algorithm.b_tile_transfer.transfer_params.k1);
+        f.writeLine(4,
+                    "Spatial thread distribution over the data tile: ",
+                    algorithm.b_tile_transfer.transfer_params.thread_cluster_dims[0],
+                    "×",
+                    algorithm.b_tile_transfer.transfer_params.thread_cluster_dims[1],
+                    "×",
+                    algorithm.b_tile_transfer.transfer_params.thread_cluster_dims[2]);
+        f.writeLine(4,
+                    "The order of accessing data tile axes: ",
+                    algorithm.b_tile_transfer.transfer_params.src_access_order[0],
+                    "×",
+                    algorithm.b_tile_transfer.transfer_params.src_access_order[1],
+                    "×",
+                    algorithm.b_tile_transfer.transfer_params.src_access_order[2]);
+        f.writeLine(4,
+                    "Vectorized memory access axis index (with contiguous memory): ",
+                    algorithm.b_tile_transfer.transfer_params.src_vector_dim);
+        f.writeLine(4,
+                    "Vector access (GMEM read) instruction size: ",
+                    algorithm.b_tile_transfer.transfer_params.src_scalar_per_vector);
+        f.writeLine(4,
+                    "Vector access (LDS write) instruction size: ",
+                    algorithm.b_tile_transfer.transfer_params.dst_scalar_per_vector_k1);
+        f.writeLast(4,
+                    "LDS data layout padding (to prevent bank conflicts): ",
+                    algorithm.b_tile_transfer.transfer_params.lds_padding);
+
+        f.writeLast(3, "C Tile transfer: ");
+        f.writeLine(4,
+                    "Data shuffle (number of gemm instructions per iteration): ",
+                    algorithm.c_tile_transfer.shuffle_params.m_gemms_per_shuffle,
+                    "×",
+                    algorithm.c_tile_transfer.shuffle_params.n_gemms_per_shuffle);
+        f.writeLine(4,
+                    "Spatial thread distribution used to store data: ",
+                    algorithm.c_tile_transfer.thread_cluster_dims[0],
+                    "×",
+                    algorithm.c_tile_transfer.thread_cluster_dims[1],
+                    "×",
+                    algorithm.c_tile_transfer.thread_cluster_dims[2],
+                    "×",
+                    algorithm.c_tile_transfer.thread_cluster_dims[3]);
+        f.writeLast(4,
+                    "Vector access (GMEM write) instruction size: ",
+                    algorithm.c_tile_transfer.scalar_per_vector);
+        f.writeLast(2);
+        f.writeLast(1);
+        return f.getString();
+    }
+
+    // Educational explanation of optimization choices.
+    // Not implemented yet: currently always yields an empty string.
+    std::string explain() const
+    {
+        // Placeholder for future implementation
+        return std::string{};
+    }
+
+    // Performance characteristics and use case guidance.
+    // Not implemented yet: currently always yields an empty string.
+    std::string suggest() const
+    {
+        // Placeholder for future implementation
+        return std::string{};
+    }
+};
+
+// True only when InstanceTraits<T> is complete; a bare `typename` check accepts every T.
+template <typename T>
+concept HasInstanceTraits = requires { sizeof(InstanceTraits<T>); };
+
+// Helper concept to detect ConvBuilder types
+template <typename T>
+concept IsConvBuilder = requires {
+    typename T::Factory;
+    typename T::Instance;
+};
+
+// Primary factory function: Create ConvDescription from Instance type directly
+template <typename Instance>
+    requires HasInstanceTraits<Instance>
+ConvDescription Describe()
+{
+    using Traits = ConvTraits<Instance>;
+
+    return ConvDescription{
+        .signature = ConvSignatureInfo{.spatial_dim       = Traits::spatial_dim,
+                                       .direction         = Traits::direction,
+                                       .layout            = Traits::layout,
+                                       .data_type         = Traits::data_type,
+                                       .input_element_op  = Traits::input_element_op,
+                                       .weight_element_op = Traits::weight_element_op,
+                                       .output_element_op = Traits::output_element_op},
+        .algorithm = GemmAlgorithmInfo{.thread_block_size   = Traits::thread_block_size,
+                                       .tile_dims           = Traits::tile_dims,
+                                       .warp_gemm           = Traits::warp_gemm,
+                                       .a_tile_transfer     = Traits::a_tile_transfer,
+                                       .b_tile_transfer     = Traits::b_tile_transfer,
+                                       .c_tile_transfer     = Traits::c_tile_transfer,
+                                       .pipeline_version    = Traits::pipeline_version,
+                                       .pipeline_scheduler  = Traits::pipeline_scheduler,
+                                       .conv_specialization = Traits::conv_specialization,
+                                       .padding             = Traits::gemm_padding}};
+}
+
+// Backward compatibility: Create ConvDescription from Builder type
+template <typename Builder>
+    requires IsConvBuilder<Builder> && (!HasInstanceTraits<Builder>)
+ConvDescription Describe()
+{
+    // Delegate to Instance-based version
+    using Instance = typename Builder::Instance;
+    return Describe<Instance>();
+}
+
+} // namespace ck_tile::reflect::conv
diff --git a/experimental/builder/include/ck_tile/builder/reflect/conv_traits.hpp b/experimental/builder/include/ck_tile/builder/reflect/conv_traits.hpp
new file mode 100644
index 0000000000..4b946011c2
--- /dev/null
+++ b/experimental/builder/include/ck_tile/builder/reflect/conv_traits.hpp
@@ -0,0 +1,722 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <concepts>
+#include <ck_tile/builder/conv_builder.hpp>
+#include <ck_tile/builder/conv_factory.hpp>
+#include <ck_tile/builder/conv_signature_concepts.hpp>
+#include <ck_tile/builder/reflect/instance_traits.hpp>
+#include <ck_tile/builder/types.hpp>
+#include <ck/tensor_operation/gpu/device/tensor_layout.hpp>
+#include <ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp>
+#include <ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp>
+#include <ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp>
+#include <ck/utility/loop_scheduler.hpp>
+#include <ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp>
+#include <ck_tile/ops/gemm.hpp>
+#include "ck_tile/ops/epilogue.hpp"
+#include <ck_tile/ops/grouped_convolution.hpp>
+
+namespace ck_tile::reflect::conv {
+
+// Helper metafunctions to convert from ck enums to builder enums
+
+/// @brief Converts a CK BlockGemmPipelineVersion enum to a builder PipelineVersion enum.
+/// @tparam ck_ver The CK BlockGemmPipelineVersion enum value to convert.
+/// @return The corresponding builder::PipelineVersion enum value (V1, V2, V3, V4, or V5).
+/// @details This function maps CK's block GEMM pipeline version identifiers to the
+/// builder framework's standardized pipeline version enum. The pipeline version
+/// determines the strategy used for data movement and computation overlap in the
+/// GEMM kernel's main loop.
+template <ck::BlockGemmPipelineVersion ck_ver>
+constexpr auto convert_pipeline_version()
+{
+    using enum ck::BlockGemmPipelineVersion;
+    using enum builder::PipelineVersion;
+    if constexpr(ck_ver == v1)
+        return V1;
+    else if constexpr(ck_ver == v2)
+        return V2;
+    else if constexpr(ck_ver == v3)
+        return V3;
+    else if constexpr(ck_ver == v4)
+        return V4;
+    else if constexpr(ck_ver == v5)
+        return V5;
+}
+
+/// @brief Converts a CK PipelineVersion enum to a builder PipelineVersion enum.
+/// @tparam ck_ver The CK PipelineVersion enum value to convert.
+/// @return The corresponding builder::PipelineVersion enum value (V1, V2, V4, or WEIGHT_ONLY).
+/// @details This function maps CK's general pipeline version identifiers to the
+/// builder framework's standardized pipeline version enum. Note that this overload
+/// handles a different set of pipeline versions compared to the BlockGemmPipelineVersion
+/// variant, including support for specialized weight-only pipelines.
+template <ck::PipelineVersion ck_ver>
+constexpr auto convert_pipeline_version()
+{
+    using enum ck::PipelineVersion;
+    using enum builder::PipelineVersion;
+    if constexpr(ck_ver == v1)
+        return V1;
+    else if constexpr(ck_ver == v2)
+        return V2;
+    else if constexpr(ck_ver == v4)
+        return V4;
+    else if constexpr(ck_ver == weight_only)
+        return WEIGHT_ONLY;
+}
+
+/// @brief Converts a CK BlockGemmPipelineScheduler enum to a builder PipelineScheduler enum.
+/// @tparam ck_sched The CK BlockGemmPipelineScheduler enum value to convert.
+/// @return The corresponding builder::PipelineScheduler enum value (INTRAWAVE or INTERWAVE).
+/// @details This function maps CK's block GEMM pipeline scheduler identifiers to the
+/// builder framework's standardized scheduler enum. The scheduler determines how work
+/// is distributed and synchronized within and across wavefronts during pipeline execution.
+/// INTRAWAVE scheduling operates within a single wavefront, while INTERWAVE coordinates
+/// across multiple wavefronts.
+template <ck::BlockGemmPipelineScheduler ck_sched>
+constexpr auto convert_pipeline_scheduler()
+{
+    using enum ck::BlockGemmPipelineScheduler;
+    using enum builder::PipelineScheduler;
+    if constexpr(ck_sched == Intrawave)
+        return INTRAWAVE;
+    else if constexpr(ck_sched == Interwave)
+        return INTERWAVE;
+}
+
+/// @brief Converts a CK LoopScheduler enum to a builder PipelineScheduler enum.
+/// @tparam ck_sched The CK LoopScheduler enum value to convert.
+/// @return The corresponding builder::PipelineScheduler enum value (DEFAULT or INTERWAVE).
+/// @details This function maps CK's loop scheduler identifiers to the builder framework's
+/// standardized pipeline scheduler enum. The loop scheduler controls how iterations of
+/// the main computational loop are scheduled across threads. DEFAULT uses the standard
+/// scheduling strategy, while INTERWAVE enables cross-wavefront coordination for improved
+/// performance in certain scenarios.
+template <ck::LoopScheduler ck_sched>
+constexpr auto convert_pipeline_scheduler()
+{
+    using enum ck::LoopScheduler;
+    using enum builder::PipelineScheduler;
+    if constexpr(ck_sched == Default)
+        return DEFAULT;
+    else if constexpr(ck_sched == Interwave)
+        return INTERWAVE;
+}
+
+// Helper structures for organizing trait data with domain-specific naming
+
+/// @brief Data tile dimensions processed by a workgroup.
+/// @details This struct defines the M, N, and K dimensions of the data tile
+/// that a single workgroup (thread block) is responsible for processing in the
+/// underlying GEMM computation.
+struct DataTileInfo
+{
+    int m; ///< M dimension of the tile processed by the workgroup (MPerBlock).
+    int n; ///< N dimension of the tile processed by the workgroup (NPerBlock).
+    int k; ///< K dimension of the tile processed by the workgroup (KPerBlock).
+};
+
+/// @brief Dimensions for an input data tile transfer.
+/// @details Defines the shape of the input tile (A or B matrix) as it is
+/// transferred from global memory to LDS. The tile is conceptually divided
+/// into k0 and k1 dimensions.
+struct InputTileTransferDimensions
+{
+    int k0;     ///< The outer dimension of K, where K = k0 * k1.
+    int m_or_n; ///< The M dimension for the A matrix transfer, or the N dimension for the B matrix.
+    int k1; ///< The inner dimension of K, often corresponding to the vector load size from global
+            ///< memory.
+};
+
+/// @brief Parameters governing the transfer of an input tile.
+/// @details This struct holds configuration details for how an input tile is
+/// loaded from global memory into LDS, including thread clustering, memory
+/// access patterns, and vectorization settings.
+struct InputTileTransferParams
+{
+    int k1; ///< The inner K dimension size, often matching the vectorization width.
+    std::array<int, 3>
+        thread_cluster_dims; ///< Spatial thread distribution over the input data tile; defines how
+                             ///< many threads are arranged on each axis.
+    std::array<int, 3> thread_cluster_order; ///< The order of thread spatial distribution over the
+                                             ///< input tensor dimensions.
+    std::array<int, 3> src_access_order; ///< The order of accessing input tensor axes (e.g., which
+                                         ///< dimension to read first).
+    int src_vector_dim; ///< The index of the axis on which vectorized memory access is performed
+                        ///< (the contiguous dimension).
+    int src_scalar_per_vector;    ///< The size of the vector access instruction; the number of
+                                  ///< elements accessed per thread per instruction.
+    int dst_scalar_per_vector_k1; ///< The size of the vectorized store into LDS memory along the K1
+                                  ///< dimension.
+    bool lds_padding; ///< Flag indicating if padding is used for the LDS tensor to prevent bank
+                      ///< conflicts.
+};
+
+/// @brief Complete information for an input tile transfer.
+/// @details Combines the dimensional information and transfer parameters for
+/// a full description of an input tile's journey from global memory to LDS.
+struct InputTileTransferInfo
+{
+    InputTileTransferDimensions tile_dimensions; ///< The shape and layout of the tile.
+    InputTileTransferParams transfer_params; ///< The parameters for the memory transfer operation.
+};
+
+/// @brief Parameters for the warp-level GEMM computation.
+/// @details Defines the configuration of the GEMM operation performed by each
+/// warp using hardware MFMA (Matrix Fused Multiply-Add) instructions.
+struct WarpGemmParams
+{
+    int gemm_m; ///< The M dimension of a single MFMA instruction (MPerXdl).
+    int gemm_n; ///< The N dimension of a single MFMA instruction (NPerXdl).
+    int m_iter; ///< The number of MFMA iterations along the M dimension of the output tile per
+                ///< wavefront (MXdlPerWave).
+    int n_iter; ///< The number of MFMA iterations along the N dimension of the output tile per
+                ///< wavefront (NXdlPerWave).
+};
+
+/// @brief Parameters for shuffling data between warps (CShuffle optimization).
+/// @details Configures how many MFMA instruction results are processed per
+/// wave in each iteration of the CShuffle routine.
+struct WarpShuffleParams
+{
+    int m_gemms_per_shuffle; ///< Number of MFMA results along the M dimension to process per wave
+                             ///< per shuffle iteration.
+    int n_gemms_per_shuffle; ///< Number of MFMA results along the N dimension to process per wave
+                             ///< per shuffle iteration.
+};
+
+/// @brief Information for the output tile transfer (CShuffle).
+/// @details Describes how the final computed tile (C matrix) is written out from
+/// LDS to global memory, including shuffling, thread clustering, and vectorization.
+struct OutputTileTransferInfo
+{
+    WarpShuffleParams shuffle_params; ///< Configuration for cross-warp data shuffling.
+    // m_block, m_wave_per_xdl, n_block, n_wave_per_xdl
+    std::array<int, 4> thread_cluster_dims; ///< The spatial thread distribution used for storing
+                                            ///< data into the output tensor.
+    int scalar_per_vector; ///< The size of the vectorized memory access when storing data to the
+                           ///< output tensor.
+};
+
+// Helper metafunctions to derive signature information from Instance types
+
+/// @brief Derives the convolution direction from a device kernel `Instance` type.
+/// @tparam Instance The device kernel instance type.
+/// @return A `builder::ConvDirection` enum value (FORWARD, BACKWARD_DATA, or BACKWARD_WEIGHT).
+template <typename Instance>
+constexpr builder::ConvDirection conv_direction()
+{
+    using InstTraits = InstanceTraits<Instance>;
+
+    if constexpr(requires { &InstTraits::kConvForwardSpecialization; })
+    {
+        return builder::ConvDirection::FORWARD;
+    }
+    else if constexpr(requires { &InstTraits::kConvBwdDataSpecialization; })
+    {
+        return builder::ConvDirection::BACKWARD_DATA;
+    }
+    else if constexpr(requires { &InstTraits::kConvBwdWeightSpecialization; })
+    {
+        return builder::ConvDirection::BACKWARD_WEIGHT;
+    }
+    else
+    {
+        return builder::ConvDirection::FORWARD; // Default fallback
+    }
+}
+
+/// @brief Derives the convolution-specific specialization from a device kernel `Instance` type.
+/// @tparam Instance The device kernel instance type.
+/// @return A `builder::ConvFwdSpecialization`, `builder::ConvBwdDataSpecialization`, or
+/// `builder::ConvBwdWeightSpecialization` enum value.
+template <typename Instance>
+constexpr auto conv_spec()
+{
+    using InstTraits = InstanceTraits<Instance>;
+
+    if constexpr(requires { InstTraits::kConvForwardSpecialization; })
+    {
+        using enum ck::tensor_operation::device::ConvolutionForwardSpecialization;
+
+        if constexpr(InstTraits::kConvForwardSpecialization == Default)
+        {
+            return builder::ConvFwdSpecialization::DEFAULT;
+        }
+        else if constexpr(InstTraits::kConvForwardSpecialization == Filter1x1Pad0)
+        {
+            return builder::ConvFwdSpecialization::FILTER_1X1_PAD0;
+        }
+        else if constexpr(InstTraits::kConvForwardSpecialization == Filter1x1Stride1Pad0)
+        {
+            return builder::ConvFwdSpecialization::FILTER_1X1_STRIDE1_PAD0;
+        }
+        else if constexpr(InstTraits::kConvForwardSpecialization == Filter3x3)
+        {
+            return builder::ConvFwdSpecialization::FILTER_3x3;
+        }
+    }
+    else if constexpr(requires { InstTraits::kConvBwdDataSpecialization; })
+    {
+        using enum ck::tensor_operation::device::ConvolutionBackwardDataSpecialization;
+
+        if constexpr(InstTraits::kConvBwdDataSpecialization == Default)
+        {
+            return builder::ConvBwdDataSpecialization::DEFAULT;
+        }
+        else if constexpr(InstTraits::kConvBwdDataSpecialization == Filter1x1Stride1Pad0)
+        {
+            return builder::ConvBwdDataSpecialization::FILTER_1X1_STRIDE1_PAD0;
+        }
+    }
+    else if constexpr(requires { InstTraits::kConvBwdWeightSpecialization; })
+    {
+        using enum ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization;
+
+        if constexpr(InstTraits::kConvBwdWeightSpecialization == Default)
+        {
+            return builder::ConvBwdWeightSpecialization::DEFAULT;
+        }
+        else if constexpr(InstTraits::kConvBwdWeightSpecialization == Filter1x1Stride1Pad0)
+        {
+            return builder::ConvBwdWeightSpecialization::FILTER_1X1_STRIDE1_PAD0;
+        }
+        else if constexpr(InstTraits::kConvBwdWeightSpecialization == Filter1x1Pad0)
+        {
+            return builder::ConvBwdWeightSpecialization::FILTER_1X1_PAD0;
+        }
+        else if constexpr(InstTraits::kConvBwdWeightSpecialization == OddC)
+        {
+            return builder::ConvBwdWeightSpecialization::ODD_C;
+        }
+    }
+}
+
+/// @brief Derives the grouped convolution layout from a device kernel `Instance` type.
+/// @tparam Instance The device kernel instance type.
+/// @return A `builder::GroupConvLayout{1D|2D|3D}` enum value corresponding to the tensor layouts.
+template <typename Instance>
+constexpr auto conv_layout()
+{
+    using InstTraits = InstanceTraits<Instance>;
+    using ALayout    = typename InstTraits::ALayout;
+    using BLayout    = typename InstTraits::BLayout;
+    using ELayout    = typename InstTraits::ELayout;
+
+    namespace ctc = ck::tensor_layout::convolution;
+
+    if constexpr(InstTraits::kSpatialDim == 1)
+    {
+        if constexpr(std::is_same_v<ALayout, ctc::GNWC> && std::is_same_v<BLayout, ctc::GKXC> &&
+                     std::is_same_v<ELayout, ctc::GNWK>)
+        {
+            return builder::GroupConvLayout1D::GNWC_GKXC_GNWK;
+        }
+        else if constexpr(std::is_same_v<ALayout, ctc::NWGC> &&
+                          std::is_same_v<BLayout, ctc::GKXC> && std::is_same_v<ELayout, ctc::NWGK>)
+        {
+            return builder::GroupConvLayout1D::NWGC_GKXC_NWGK;
+        }
+        else if constexpr(std::is_same_v<ALayout, ctc::NGCW> &&
+                          std::is_same_v<BLayout, ctc::GKXC> && std::is_same_v<ELayout, ctc::NGKW>)
+        {
+            return builder::GroupConvLayout1D::NGCW_GKXC_NGKW;
+        }
+        else if constexpr(std::is_same_v<ALayout, ctc::NGCW> &&
+                          std::is_same_v<BLayout, ctc::GKCX> && std::is_same_v<ELayout, ctc::NGKW>)
+        {
+            return builder::GroupConvLayout1D::NGCW_GKCX_NGKW;
+        }
+    }
+    else if constexpr(InstTraits::kSpatialDim == 2)
+    {
+        if constexpr(std::is_same_v<ALayout, ctc::GNHWC> && std::is_same_v<BLayout, ctc::GKYXC> &&
+                     std::is_same_v<ELayout, ctc::GNHWK>)
+        {
+            return builder::GroupConvLayout2D::GNHWC_GKYXC_GNHWK;
+        }
+        else if constexpr(std::is_same_v<ALayout, ctc::NHWGC> &&
+                          std::is_same_v<BLayout, ctc::GKYXC> &&
+                          std::is_same_v<ELayout, ctc::NHWGK>)
+        {
+            return builder::GroupConvLayout2D::NHWGC_GKYXC_NHWGK;
+        }
+        else if constexpr(std::is_same_v<ALayout, ctc::NGCHW> &&
+                          std::is_same_v<BLayout, ctc::GKYXC> &&
+                          std::is_same_v<ELayout, ctc::NGKHW>)
+        {
+            return builder::GroupConvLayout2D::NGCHW_GKYXC_NGKHW;
+        }
+        else if constexpr(std::is_same_v<ALayout, ctc::NGCHW> &&
+                          std::is_same_v<BLayout, ctc::GKCYX> &&
+                          std::is_same_v<ELayout, ctc::NGKHW>)
+        {
+            return builder::GroupConvLayout2D::NGCHW_GKCYX_NGKHW;
+        }
+    }
+    else if constexpr(InstTraits::kSpatialDim == 3)
+    {
+        if constexpr(std::is_same_v<ALayout, ctc::GNDHWC> && std::is_same_v<BLayout, ctc::GKZYXC> &&
+                     std::is_same_v<ELayout, ctc::GNDHWK>)
+        {
+            return builder::GroupConvLayout3D::GNDHWC_GKZYXC_GNDHWK;
+        }
+        else if constexpr(std::is_same_v<ALayout, ctc::NDHWGC> &&
+                          std::is_same_v<BLayout, ctc::GKZYXC> &&
+                          std::is_same_v<ELayout, ctc::NDHWGK>)
+        {
+            return builder::GroupConvLayout3D::NDHWGC_GKZYXC_NDHWGK;
+        }
+        else if constexpr(std::is_same_v<ALayout, ctc::NGCDHW> &&
+                          std::is_same_v<BLayout, ctc::GKZYXC> &&
+                          std::is_same_v<ELayout, ctc::NGKDHW>)
+        {
+            return builder::GroupConvLayout3D::NGCDHW_GKZYXC_NGKDHW;
+        }
+        else if constexpr(std::is_same_v<ALayout, ctc::NGCDHW> &&
+                          std::is_same_v<BLayout, ctc::GKCZYX> &&
+                          std::is_same_v<ELayout, ctc::NGKDHW>)
+        {
+            return builder::GroupConvLayout3D::NGCDHW_GKCZYX_NGKDHW;
+        }
+    }
+}
+
+/// @brief Derives the data type from a device kernel `Instance` type.
+/// @tparam Instance The device kernel instance type.
+/// @return A `builder::DataType` enum value (e.g., FP16, BF16, FP32).
+template <typename Instance>
+constexpr builder::DataType conv_data_type()
+{
+    using InstTraits = InstanceTraits<Instance>;
+    using ADataType  = typename InstTraits::ADataType;
+
+    if constexpr(std::is_same_v<ADataType, ck::half_t>)
+    {
+        return builder::DataType::FP16;
+    }
+    else if constexpr(std::is_same_v<ADataType, ck::bhalf_t>)
+    {
+        return builder::DataType::BF16;
+    }
+    else if constexpr(std::is_same_v<ADataType, float>)
+    {
+        return builder::DataType::FP32;
+    }
+    else if constexpr(std::is_same_v<ADataType, ck::f8_t>)
+    {
+        return builder::DataType::FP8;
+    }
+    else if constexpr(std::is_same_v<ADataType, int8_t>)
+    {
+        return builder::DataType::I8;
+    }
+    else if constexpr(std::is_same_v<ADataType, uint8_t>)
+    {
+        return builder::DataType::U8;
+    }
+    else
+    {
+        // Default fallback
+        return builder::DataType::FP32;
+    }
+}
+
+/// @brief Derives the elementwise operation from op type.
+/// @details The functor is identified by the name that `detail::elementwise_op_name` yields
+/// for it; `detail::case_insensitive_equal` compares that name with each supported operation.
+/// A functor whose name matches none of them is rejected at compile time with an explicit
+/// diagnostic, instead of letting control flow off the end of this value-returning function
+/// (undefined behaviour at run time, an opaque error during constant evaluation).
+/// @tparam ElementwiseOp Elementwise operation functor type.
+/// @return A `builder::ElementwiseOperation` enum value corresponding to elementwise operation.
+template <typename ElementwiseOp>
+constexpr builder::ElementwiseOperation elementwise_op()
+{
+    constexpr std::string_view name = detail::elementwise_op_name<ElementwiseOp>();
+
+    if constexpr(detail::case_insensitive_equal(name, "Bias"))
+        return builder::ElementwiseOperation::BIAS;
+    else if constexpr(detail::case_insensitive_equal(name, "BiasClamp"))
+        return builder::ElementwiseOperation::BIAS_CLAMP;
+    else if constexpr(detail::case_insensitive_equal(name, "BiasBnormClamp"))
+        return builder::ElementwiseOperation::BIAS_BNORM_CLAMP;
+    else if constexpr(detail::case_insensitive_equal(name, "Bilinear"))
+        return builder::ElementwiseOperation::BILINEAR;
+    else if constexpr(detail::case_insensitive_equal(name, "Clamp"))
+        return builder::ElementwiseOperation::CLAMP;
+    else if constexpr(detail::case_insensitive_equal(name, "Scale"))
+        return builder::ElementwiseOperation::SCALE;
+    else if constexpr(detail::case_insensitive_equal(name, "PassThrough"))
+        return builder::ElementwiseOperation::PASS_THROUGH;
+    else
+    {
+        // Unsupported functor: fail with a readable message.
+        // The condition mentions ElementwiseOp so that it is evaluated only when this branch
+        // is instantiated, i.e. when none of the names above matched.
+        static_assert(!std::is_same_v<ElementwiseOp, ElementwiseOp>,
+                      "elementwise_op: functor has no builder::ElementwiseOperation mapping");
+    }
+}
+
+/// @brief Translates a kernel's GEMM specialization into the builder's padding enum.
+/// @tparam Instance Device kernel type whose `InstanceTraits` expose `kGemmSpecialization`.
+/// @return The `builder::GemmPadding` enumerator paired with that specialization.
+template <typename Instance>
+constexpr builder::GemmPadding gemm_spec()
+{
+    // Import both enums so every branch below is a bare source-name / target-name pair.
+    using enum ck::tensor_operation::device::GemmSpecialization;
+    using enum builder::GemmPadding;
+
+    constexpr auto kernel_spec = InstanceTraits<Instance>::kGemmSpecialization;
+
+    if constexpr(kernel_spec == Default)
+    {
+        return DEFAULT;
+    }
+    else if constexpr(kernel_spec == MPadding)
+    {
+        return M_PADDING;
+    }
+    else if constexpr(kernel_spec == NPadding)
+    {
+        return N_PADDING;
+    }
+    else if constexpr(kernel_spec == KPadding)
+    {
+        return K_PADDING;
+    }
+    else if constexpr(kernel_spec == MNPadding)
+    {
+        return MN_PADDING;
+    }
+    else if constexpr(kernel_spec == MKPadding)
+    {
+        return MK_PADDING;
+    }
+    else if constexpr(kernel_spec == NKPadding)
+    {
+        return NK_PADDING;
+    }
+    else if constexpr(kernel_spec == MNKPadding)
+    {
+        return MNK_PADDING;
+    }
+    else if constexpr(kernel_spec == OPadding)
+    {
+        return O_PADDING;
+    }
+    else if constexpr(kernel_spec == MOPadding)
+    {
+        return MO_PADDING;
+    }
+    else if constexpr(kernel_spec == NOPadding)
+    {
+        return NO_PADDING;
+    }
+    else if constexpr(kernel_spec == KOPadding)
+    {
+        return KO_PADDING;
+    }
+    else if constexpr(kernel_spec == MNOPadding)
+    {
+        return MNO_PADDING;
+    }
+    else if constexpr(kernel_spec == MKOPadding)
+    {
+        return MKO_PADDING;
+    }
+    else if constexpr(kernel_spec == NKOPadding)
+    {
+        return NKO_PADDING;
+    }
+    else if constexpr(kernel_spec == MNKOPadding)
+    {
+        return MNKO_PADDING;
+    }
+}
+
+/// @brief Primary template for extracting convolution traits.
+/// @details Declared without a definition at this point: the usable members come from the
+/// partial specializations below (device kernel `Instance` types and `ConvBuilder` types).
+template <typename T>
+struct ConvTraits;
+
+/// @brief Specialization of `ConvTraits` for a direct device kernel `Instance`.
+/// @details Extracts a comprehensive set of traits from a fully-formed device kernel type,
+/// using `InstanceTraits<Instance>` to access the kernel's template parameters.
+/// NOTE(review): the `requires` clause only names that type; confirm it rejects unsupported T.
+template <typename Instance>
+    requires requires { typename InstanceTraits<Instance>; }
+struct ConvTraits<Instance>
+{
+    using InstTraits = InstanceTraits<Instance>;
+
+    // --- Signature Information ---
+    /// @brief The number of spatial dimensions in the convolution (1, 2, or 3).
+    static constexpr int spatial_dim = InstTraits::kSpatialDim;
+    /// @brief The direction of the convolution (Forward, Backward Data, or Backward Weight).
+    static constexpr builder::ConvDirection direction = conv_direction<Instance>();
+    /// @brief The memory layout of the convolution tensors (e.g., GNHWC_GKYXC_GNHWK).
+    static constexpr auto layout = conv_layout<Instance>();
+    /// @brief The data type derived from the kernel's `ADataType` (e.g., FP16, FP32).
+    static constexpr builder::DataType data_type = conv_data_type<Instance>();
+
+    static constexpr builder::ElementwiseOperation input_element_op =
+        elementwise_op<typename InstTraits::AElementwiseOperation>();
+    static constexpr builder::ElementwiseOperation weight_element_op =
+        elementwise_op<typename InstTraits::BElementwiseOperation>();
+    static constexpr builder::ElementwiseOperation output_element_op =
+        elementwise_op<typename InstTraits::CDEElementwiseOperation>();
+
+    /// @brief The GEMM padding mode, translated from the kernel's GEMM specialization.
+    static constexpr auto gemm_padding = gemm_spec<Instance>();
+    /// @brief The convolution-specific specialization (e.g., Default, 1x1).
+    static constexpr auto conv_specialization = conv_spec<Instance>();
+
+    // --- Algorithm Information ---
+    /// @brief The total number of threads in a thread block (workgroup).
+    static constexpr int thread_block_size = InstTraits::kBlockSize;
+    /// @brief The M, N and K extents of the data tile processed by the thread block.
+    static constexpr DataTileInfo tile_dims = {
+        .m = InstTraits::kMPerBlock, .n = InstTraits::kNPerBlock, .k = InstTraits::kKPerBlock};
+
+    /// @brief Configuration for the A-matrix (input) tile transfer (k0 = kKPerBlock / kAK1).
+    static constexpr InputTileTransferInfo a_tile_transfer = {
+        .tile_dimensions = {.k0     = InstTraits::kKPerBlock / InstTraits::kAK1,
+                            .m_or_n = InstTraits::kMPerBlock,
+                            .k1     = InstTraits::kAK1},
+        .transfer_params = {.k1                    = InstTraits::kAK1,
+                            .thread_cluster_dims   = InstTraits::kAThreadClusterLengths,
+                            .thread_cluster_order  = InstTraits::kAThreadClusterArrangeOrder,
+                            .src_access_order      = InstTraits::kABlockTransferSrcAccessOrder,
+                            .src_vector_dim        = InstTraits::kABlockTransferSrcVectorDim,
+                            .src_scalar_per_vector = InstTraits::kABlockTransferSrcScalarPerVector,
+                            .dst_scalar_per_vector_k1 =
+                                InstTraits::kABlockTransferDstScalarPerVectorK1,
+                            .lds_padding = static_cast<bool>(InstTraits::kABlockLdsExtraM)}};
+
+    /// @brief Configuration for the B-matrix (weights) tile transfer (k0 = kKPerBlock / kBK1).
+    static constexpr InputTileTransferInfo b_tile_transfer = {
+        .tile_dimensions = {.k0     = InstTraits::kKPerBlock / InstTraits::kBK1,
+                            .m_or_n = InstTraits::kNPerBlock,
+                            .k1     = InstTraits::kBK1},
+        .transfer_params = {.k1                    = InstTraits::kBK1,
+                            .thread_cluster_dims   = InstTraits::kBThreadClusterLengths,
+                            .thread_cluster_order  = InstTraits::kBThreadClusterArrangeOrder,
+                            .src_access_order      = InstTraits::kBBlockTransferSrcAccessOrder,
+                            .src_vector_dim        = InstTraits::kBBlockTransferSrcVectorDim,
+                            .src_scalar_per_vector = InstTraits::kBBlockTransferSrcScalarPerVector,
+                            .dst_scalar_per_vector_k1 =
+                                InstTraits::kBBlockTransferDstScalarPerVectorK1,
+                            .lds_padding = static_cast<bool>(InstTraits::kBBlockLdsExtraN)}};
+
+    /// @brief Parameters for the warp-level GEMM computation, read from the XDL settings.
+    static constexpr WarpGemmParams warp_gemm = {.gemm_m = InstTraits::kMPerXDL,
+                                                 .gemm_n = InstTraits::kNPerXDL,
+                                                 .m_iter = InstTraits::kMXdlPerWave,
+                                                 .n_iter = InstTraits::kNXdlPerWave};
+
+    /// @brief Configuration for the C-matrix (output) tile transfer.
+    static constexpr OutputTileTransferInfo c_tile_transfer = {
+        .shuffle_params      = {.m_gemms_per_shuffle = InstTraits::kCShuffleMXdlPerWavePerShuffle,
+                                .n_gemms_per_shuffle = InstTraits::kCShuffleNXdlPerWavePerShuffle},
+        .thread_cluster_dims = {InstTraits::kCThreadClusterLengths[0],
+                                InstTraits::kCThreadClusterLengths[1],
+                                InstTraits::kCThreadClusterLengths[2],
+                                InstTraits::kCThreadClusterLengths[3]},
+        .scalar_per_vector   = InstTraits::kCBlockTransferScalarPerVector};
+
+    /// @brief Helper to safely get the pipeline version.
+    /// @details `kPipelineVersion` is looked up on `InstanceTraits`; when that member exists
+    /// it is converted, otherwise `builder::PipelineVersion::V1` is returned instead.
+    template <typename T = InstTraits>
+    static constexpr auto get_pipeline_version()
+    {
+        if constexpr(requires { T::kPipelineVersion; })
+        {
+            return convert_pipeline_version<T::kPipelineVersion>();
+        }
+        else
+        {
+            // No kPipelineVersion member: fall back to V1.
+            return builder::PipelineVersion::V1;
+        }
+    }
+
+    /// @brief The block GEMM pipeline version used by the kernel.
+    static constexpr auto pipeline_version = get_pipeline_version();
+
+    /// @brief Helper to safely get the pipeline scheduler.
+    /// @details Prefers `kPipelineScheduler`, then `kLoopScheduler`; when `InstanceTraits`
+    /// has neither member, `builder::PipelineScheduler::DEFAULT` is returned.
+    template <typename T = InstTraits>
+    static constexpr auto get_pipeline_scheduler()
+    {
+        if constexpr(requires { T::kPipelineScheduler; })
+        {
+            return convert_pipeline_scheduler<T::kPipelineScheduler>();
+        }
+        else if constexpr(requires { T::kLoopScheduler; })
+        {
+            return convert_pipeline_scheduler<T::kLoopScheduler>();
+        }
+        else
+        {
+            // Neither scheduler member exists: fall back to DEFAULT.
+            return builder::PipelineScheduler::DEFAULT;
+        }
+    }
+
+    /// @brief The pipeline scheduler used by the kernel.
+    static constexpr auto pipeline_scheduler = get_pipeline_scheduler();
+};
+
+/// @brief Specialization of `ConvTraits` for a `ConvBuilder` type.
+/// @details Allows reflecting on kernels described through the `ConvBuilder` interface.
+/// The kernel `Instance` is obtained from `builder::ConvFactory` with the same three
+/// template arguments, and every member below is copied from `ConvTraits<Instance>`,
+/// so both ways of naming a kernel report identical traits.
+template <builder::ConvSignatureDescriptor auto SIGNATURE,
+          builder::ConvAlgorithmDescriptor auto ALGORITHM,
+          builder::StringLiteral VERSION>
+struct ConvTraits<builder::ConvBuilder<SIGNATURE, ALGORITHM, VERSION>>
+{
+    using Factory  = builder::ConvFactory<SIGNATURE, ALGORITHM, VERSION>;
+    using Instance = typename Factory::Instance;
+
+    // All trait extraction is delegated to the Instance-based ConvTraits specialization.
+    using InstanceConvTraits = ConvTraits<Instance>;
+
+    // Re-export each member under the same name so both specializations share one interface.
+    static constexpr int spatial_dim                  = InstanceConvTraits::spatial_dim;
+    static constexpr builder::ConvDirection direction = InstanceConvTraits::direction;
+    static constexpr auto layout                      = InstanceConvTraits::layout;
+    static constexpr builder::DataType data_type      = InstanceConvTraits::data_type;
+
+    static constexpr builder::ElementwiseOperation input_element_op =
+        InstanceConvTraits::input_element_op;
+    static constexpr builder::ElementwiseOperation weight_element_op =
+        InstanceConvTraits::weight_element_op;
+    static constexpr builder::ElementwiseOperation output_element_op =
+        InstanceConvTraits::output_element_op;
+
+    static constexpr auto gemm_padding        = InstanceConvTraits::gemm_padding;
+    static constexpr auto conv_specialization = InstanceConvTraits::conv_specialization;
+
+    static constexpr int thread_block_size                  = InstanceConvTraits::thread_block_size;
+    static constexpr DataTileInfo tile_dims                 = InstanceConvTraits::tile_dims;
+    static constexpr InputTileTransferInfo a_tile_transfer  = InstanceConvTraits::a_tile_transfer;
+    static constexpr InputTileTransferInfo b_tile_transfer  = InstanceConvTraits::b_tile_transfer;
+    static constexpr WarpGemmParams warp_gemm               = InstanceConvTraits::warp_gemm;
+    static constexpr OutputTileTransferInfo c_tile_transfer = InstanceConvTraits::c_tile_transfer;
+    static constexpr auto pipeline_version                  = InstanceConvTraits::pipeline_version;
+    static constexpr auto pipeline_scheduler = InstanceConvTraits::pipeline_scheduler;
+};
+
+} // namespace ck_tile::reflect::conv
diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits.hpp
index a47ad0ef57..07f1b94b07 100644
--- a/experimental/builder/include/ck_tile/builder/reflect/instance_traits.hpp
+++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits.hpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 // Compile-time reflection for CK device kernel instances.
 //
@@ -14,18 +14,9 @@
 
 #pragma once
 
-#include <array>
 #include <string>
-#include <sstream>
 #include <type_traits>
-#include <ck/utility/data_type.hpp>
-#include <ck/utility/sequence.hpp>
-#include <ck/utility/blkgemmpipe_scheduler.hpp>
-#include <ck/tensor_operation/gpu/device/tensor_layout.hpp>
-#include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
-#include <ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp>
-#include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
-#include "instance_traits_util.hpp"
+#include <concepts>
 
 namespace ck_tile::reflect {
 
diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_bwd_weight_xdl_cshuffle.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
new file mode 100644
index 0000000000..6913889c4f
--- /dev/null
+++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
@@ -0,0 +1,286 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include "instance_traits.hpp"
+#include "instance_traits_util.hpp"
+#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
+
+// Forward declaration to avoid a circular dependency; all 43 parameters must mirror the kernel.
+namespace ck::tensor_operation::device {
+
+template <ck::index_t NDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename AccDataType,
+          typename InElementwiseOperation,
+          typename WeiElementwiseOperation,
+          typename OutElementwiseOperation,
+          ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
+              ConvBackwardWeightSpecialization,
+          ck::index_t BlockSize,
+          ck::index_t MPerBlock,
+          ck::index_t NPerBlock,
+          ck::index_t K0PerBlock,
+          ck::index_t K1,
+          ck::index_t MPerXDL,
+          ck::index_t NPerXDL,
+          ck::index_t MXdlPerWave,
+          ck::index_t NXdlPerWave,
+          typename ABlockTransferThreadClusterLengths_K0_M_K1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          ck::index_t ABlockTransferSrcVectorDim,
+          ck::index_t ABlockTransferSrcScalarPerVector,
+          ck::index_t ABlockTransferDstScalarPerVector_K1,
+          bool ABlockLdsAddExtraM,
+          typename BBlockTransferThreadClusterLengths_K0_N_K1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          ck::index_t BBlockTransferSrcVectorDim,
+          ck::index_t BBlockTransferSrcScalarPerVector,
+          ck::index_t BBlockTransferDstScalarPerVector_K1,
+          bool BBlockLdsAddExtraN,
+          ck::index_t CShuffleMXdlPerWavePerShuffle,
+          ck::index_t CShuffleNXdlPerWavePerShuffle,
+          typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          ck::index_t CBlockTransferScalarPerVector_NWaveNPerXdl,
+          typename ComputeTypeA,
+          typename ComputeTypeB,
+          ck::index_t MaxTransposeTransferSrcScalarPerVector,
+          ck::index_t MaxTransposeTransferDstScalarPerVector>
+struct DeviceGroupedConvBwdWeight_Xdl_CShuffle; // declared only; no definition in this header
+
+} // namespace ck::tensor_operation::device
+
+namespace ck_tile {
+namespace reflect {
+
+template <ck::index_t NDimSpatial,
+          typename InLayout_,
+          typename WeiLayout_,
+          typename OutLayout_,
+          typename InDataType_,
+          typename WeiDataType_,
+          typename OutDataType_,
+          typename AccDataType_,
+          typename InElementwiseOperation_,
+          typename WeiElementwiseOperation_,
+          typename OutElementwiseOperation_,
+          ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization
+              ConvBackwardWeightSpecialization,
+          ck::index_t BlockSize,
+          ck::index_t MPerBlock,
+          ck::index_t NPerBlock,
+          ck::index_t K0PerBlock,
+          ck::index_t K1,
+          ck::index_t MPerXDL,
+          ck::index_t NPerXDL,
+          ck::index_t MXdlPerWave,
+          ck::index_t NXdlPerWave,
+          typename ABlockTransferThreadClusterLengths_K0_M_K1_,
+          typename ABlockTransferThreadClusterArrangeOrder_,
+          typename ABlockTransferSrcAccessOrder_,
+          ck::index_t ABlockTransferSrcVectorDim,
+          ck::index_t ABlockTransferSrcScalarPerVector,
+          ck::index_t ABlockTransferDstScalarPerVector_K1,
+          bool ABlockLdsAddExtraM,
+          typename BBlockTransferThreadClusterLengths_K0_N_K1_,
+          typename BBlockTransferThreadClusterArrangeOrder_,
+          typename BBlockTransferSrcAccessOrder_,
+          ck::index_t BBlockTransferSrcVectorDim,
+          ck::index_t BBlockTransferSrcScalarPerVector,
+          ck::index_t BBlockTransferDstScalarPerVector_K1,
+          bool BBlockLdsAddExtraN,
+          ck::index_t CShuffleMXdlPerWavePerShuffle,
+          ck::index_t CShuffleNXdlPerWavePerShuffle,
+          typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
+          ck::index_t CBlockTransferScalarPerVector_NWaveNPerXdl,
+          typename ComputeTypeA_,
+          typename ComputeTypeB_,
+          ck::index_t MaxTransposeTransferSrcScalarPerVector,
+          ck::index_t MaxTransposeTransferDstScalarPerVector>
+struct InstanceTraits<ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Xdl_CShuffle<
+    NDimSpatial,
+    InLayout_,
+    WeiLayout_,
+    OutLayout_,
+    InDataType_,
+    WeiDataType_,
+    OutDataType_,
+    AccDataType_,
+    InElementwiseOperation_,
+    WeiElementwiseOperation_,
+    OutElementwiseOperation_,
+    ConvBackwardWeightSpecialization,
+    BlockSize,
+    MPerBlock,
+    NPerBlock,
+    K0PerBlock,
+    K1,
+    MPerXDL,
+    NPerXDL,
+    MXdlPerWave,
+    NXdlPerWave,
+    ABlockTransferThreadClusterLengths_K0_M_K1_,
+    ABlockTransferThreadClusterArrangeOrder_,
+    ABlockTransferSrcAccessOrder_,
+    ABlockTransferSrcVectorDim,
+    ABlockTransferSrcScalarPerVector,
+    ABlockTransferDstScalarPerVector_K1,
+    ABlockLdsAddExtraM,
+    BBlockTransferThreadClusterLengths_K0_N_K1_,
+    BBlockTransferThreadClusterArrangeOrder_,
+    BBlockTransferSrcAccessOrder_,
+    BBlockTransferSrcVectorDim,
+    BBlockTransferSrcScalarPerVector,
+    BBlockTransferDstScalarPerVector_K1,
+    BBlockLdsAddExtraN,
+    CShuffleMXdlPerWavePerShuffle,
+    CShuffleNXdlPerWavePerShuffle,
+    CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_,
+    CBlockTransferScalarPerVector_NWaveNPerXdl,
+    ComputeTypeA_,
+    ComputeTypeB_,
+    MaxTransposeTransferSrcScalarPerVector,
+    MaxTransposeTransferDstScalarPerVector>>
+{ // Re-exports each template argument: types as aliases, values as k-prefixed constants.
+    static constexpr auto kTensorOpName = "DeviceGroupedConvBwdWeight_Xdl_CShuffle";
+
+    static constexpr ck::index_t kNDimSpatial = NDimSpatial;
+
+    using InLayout  = InLayout_;
+    using WeiLayout = WeiLayout_;
+    using OutLayout = OutLayout_;
+
+    using InDataType  = InDataType_;
+    using WeiDataType = WeiDataType_;
+    using OutDataType = OutDataType_;
+    using AccDataType = AccDataType_;
+
+    using InElementwiseOperation  = InElementwiseOperation_;
+    using WeiElementwiseOperation = WeiElementwiseOperation_;
+    using OutElementwiseOperation = OutElementwiseOperation_;
+
+    static constexpr auto kConvBackwardWeightSpecialization = ConvBackwardWeightSpecialization;
+
+    static constexpr ck::index_t kBlockSize   = BlockSize;
+    static constexpr ck::index_t kMPerBlock   = MPerBlock;
+    static constexpr ck::index_t kNPerBlock   = NPerBlock;
+    static constexpr ck::index_t kK0PerBlock  = K0PerBlock;
+    static constexpr ck::index_t kK1          = K1;
+    static constexpr ck::index_t kMPerXDL     = MPerXDL;
+    static constexpr ck::index_t kNPerXDL     = NPerXDL;
+    static constexpr ck::index_t kMXdlPerWave = MXdlPerWave;
+    static constexpr ck::index_t kNXdlPerWave = NXdlPerWave;
+
+    using ABlockTransferThreadClusterLengths_K0_M_K1 = ABlockTransferThreadClusterLengths_K0_M_K1_;
+    using ABlockTransferThreadClusterArrangeOrder    = ABlockTransferThreadClusterArrangeOrder_;
+    using ABlockTransferSrcAccessOrder               = ABlockTransferSrcAccessOrder_;
+    static constexpr ck::index_t kABlockTransferSrcVectorDim = ABlockTransferSrcVectorDim;
+    static constexpr ck::index_t kABlockTransferSrcScalarPerVector =
+        ABlockTransferSrcScalarPerVector;
+    static constexpr ck::index_t kABlockTransferDstScalarPerVector_K1 =
+        ABlockTransferDstScalarPerVector_K1;
+    static constexpr bool kABlockLdsAddExtraM = ABlockLdsAddExtraM;
+
+    using BBlockTransferThreadClusterLengths_K0_N_K1 = BBlockTransferThreadClusterLengths_K0_N_K1_;
+    using BBlockTransferThreadClusterArrangeOrder    = BBlockTransferThreadClusterArrangeOrder_;
+    using BBlockTransferSrcAccessOrder               = BBlockTransferSrcAccessOrder_;
+    static constexpr ck::index_t kBBlockTransferSrcVectorDim = BBlockTransferSrcVectorDim;
+    static constexpr ck::index_t kBBlockTransferSrcScalarPerVector =
+        BBlockTransferSrcScalarPerVector;
+    static constexpr ck::index_t kBBlockTransferDstScalarPerVector_K1 =
+        BBlockTransferDstScalarPerVector_K1;
+    static constexpr bool kBBlockLdsAddExtraN = BBlockLdsAddExtraN;
+
+    static constexpr ck::index_t kCShuffleMXdlPerWavePerShuffle = CShuffleMXdlPerWavePerShuffle;
+    static constexpr ck::index_t kCShuffleNXdlPerWavePerShuffle = CShuffleNXdlPerWavePerShuffle;
+
+    using CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock =
+        CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock_;
+    static constexpr ck::index_t kCBlockTransferScalarPerVector_NWaveNPerXdl =
+        CBlockTransferScalarPerVector_NWaveNPerXdl;
+
+    using ComputeTypeA = ComputeTypeA_;
+    using ComputeTypeB = ComputeTypeB_;
+
+    static constexpr ck::index_t kMaxTransposeTransferSrcScalarPerVector =
+        MaxTransposeTransferSrcScalarPerVector;
+    static constexpr ck::index_t kMaxTransposeTransferDstScalarPerVector =
+        MaxTransposeTransferDstScalarPerVector;
+
+    // Renders the kernel name followed by all 43 template arguments as "<a,b,...>".
+    static std::string instance_string()
+    {
+        std::ostringstream oss; // NOTE(review): <sstream> is not included here directly; confirm
+
+        // Kernel type name (same text as kTensorOpName)
+        oss << "DeviceGroupedConvBwdWeight_Xdl_CShuffle";
+
+        // Template parameters in declaration order; trailing numbers are 1-based positions
+        oss << "<" << kNDimSpatial;                     // 1. NDimSpatial
+        oss << "," << detail::layout_name<InLayout>();  // 2. InLayout
+        oss << "," << detail::layout_name<WeiLayout>(); // 3. WeiLayout
+        oss << "," << detail::layout_name<OutLayout>(); // 4. OutLayout
+        oss << "," << detail::type_name<InDataType>();  // 5. InDataType
+        oss << "," << detail::type_name<WeiDataType>(); // 6. WeiDataType
+        oss << "," << detail::type_name<OutDataType>(); // 7. OutDataType
+        oss << "," << detail::type_name<AccDataType>(); // 8. AccDataType
+        oss << ","
+            << detail::elementwise_op_name<InElementwiseOperation>(); // 9. InElementwiseOperation
+        oss << ","
+            << detail::elementwise_op_name<WeiElementwiseOperation>(); // 10.
+                                                                       // WeiElementwiseOperation
+        oss << ","
+            << detail::elementwise_op_name<OutElementwiseOperation>(); // 11.
+                                                                       // OutElementwiseOperation
+        oss << ","
+            << detail::conv_bwd_weight_spec_name(
+                   kConvBackwardWeightSpecialization); // 12. ConvBackwardWeightSpecialization
+        oss << "," << kBlockSize;                      // 13. BlockSize
+        oss << "," << kMPerBlock;                      // 14. MPerBlock
+        oss << "," << kNPerBlock;                      // 15. NPerBlock
+        oss << "," << kK0PerBlock;                     // 16. K0PerBlock
+        oss << "," << kK1;                             // 17. K1
+        oss << "," << kMPerXDL;                        // 18. MPerXDL
+        oss << "," << kNPerXDL;                        // 19. NPerXDL
+        oss << "," << kMXdlPerWave;                    // 20. MXdlPerWave
+        oss << "," << kNXdlPerWave;                    // 21. NXdlPerWave
+        oss << "," << detail::sequence_name<ABlockTransferThreadClusterLengths_K0_M_K1>(); // 22.
+        oss << "," << detail::sequence_name<ABlockTransferThreadClusterArrangeOrder>();    // 23.
+        oss << "," << detail::sequence_name<ABlockTransferSrcAccessOrder>();               // 24.
+        oss << "," << kABlockTransferSrcVectorDim;                                         // 25.
+        oss << "," << kABlockTransferSrcScalarPerVector;                                   // 26.
+        oss << "," << kABlockTransferDstScalarPerVector_K1;                                // 27.
+        oss << "," << (kABlockLdsAddExtraM ? "true" : "false");                            // 28.
+        oss << "," << detail::sequence_name<BBlockTransferThreadClusterLengths_K0_N_K1>(); // 29.
+        oss << "," << detail::sequence_name<BBlockTransferThreadClusterArrangeOrder>();    // 30.
+        oss << "," << detail::sequence_name<BBlockTransferSrcAccessOrder>();               // 31.
+        oss << "," << kBBlockTransferSrcVectorDim;                                         // 32.
+        oss << "," << kBBlockTransferSrcScalarPerVector;                                   // 33.
+        oss << "," << kBBlockTransferDstScalarPerVector_K1;                                // 34.
+        oss << "," << (kBBlockLdsAddExtraN ? "true" : "false");                            // 35.
+        oss << "," << kCShuffleMXdlPerWavePerShuffle;                                      // 36.
+        oss << "," << kCShuffleNXdlPerWavePerShuffle;                                      // 37.
+        oss << ","
+            << detail::sequence_name<
+                   CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock>(); // 38.
+        oss << "," << kCBlockTransferScalarPerVector_NWaveNPerXdl;                    // 39.
+        oss << "," << detail::type_name<ComputeTypeA>();                              // 40.
+        oss << "," << detail::type_name<ComputeTypeB>();                              // 41.
+        oss << "," << kMaxTransposeTransferSrcScalarPerVector;                        // 42.
+        oss << "," << kMaxTransposeTransferDstScalarPerVector;                        // 43.
+        oss << ">";
+
+        return oss.str();
+    }
+};
+
+} // namespace reflect
+} // namespace ck_tile
diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
index 54ba224f7f..8756825c3f 100644
--- a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
+++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 // InstanceTraits specialization for DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK
 //
diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
index 346b731c33..572b04e75b 100644
--- a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
+++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 // InstanceTraits specialization for DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
 //
diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp
index 5784938fc6..d6fc6da0d6 100644
--- a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp
+++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 // InstanceTraits specialization for DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3
 //
@@ -15,6 +15,7 @@
 #pragma once
 
 #include "instance_traits.hpp"
+#include "instance_traits_util.hpp"
 
 // Forward declaration to avoid circular dependency.
 // This file will be included by the device implementation header, so we cannot include
diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
index 9f64c7d6e8..9edfa4d4c9 100644
--- a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
+++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 // InstanceTraits specialization for DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
 //
@@ -14,6 +14,7 @@
 #pragma once
 
 #include "instance_traits.hpp"
+#include "instance_traits_util.hpp"
 
 // Forward declaration to avoid circular dependency.
 // This file will be included by the device implementation header, so we cannot include
diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp
index b41aed784c..f60d5e7b68 100644
--- a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp
+++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 // InstanceTraits specialization for DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor
 //
diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_forward.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_forward.hpp
new file mode 100644
index 0000000000..f364b37ae5
--- /dev/null
+++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_forward.hpp
@@ -0,0 +1,140 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+// InstanceTraits specialization for GroupedConvolutionForwardKernel
+//
+// CRITICAL MAINTENANCE NOTE:
+// This InstanceTraits file MUST be kept strictly in sync with the device implementation header:
+//   ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp
+// "In sync" means that the template parameter order, names, and types in the declaration below
+// MUST EXACTLY MATCH those in the device implementation. If these diverge, you may encounter
+// compilation errors, subtle template instantiation mismatches, or silent runtime bugs that are
+// difficult to diagnose. Always update both files together and review changes carefully.
+
+#pragma once
+
+#include "instance_traits.hpp"
+#include "instance_traits_util.hpp"
+
+// Forward declaration to avoid circular dependency.
+namespace ck_tile::device {
+
+template <typename GroupedConvTraitsType_,
+          typename TilePartitioner_,
+          typename GemmPipeline_,
+          typename EpiloguePipeline_>
+struct GroupedConvolutionForwardKernel;
+
+} // namespace ck_tile::device
+
+namespace ck_tile {
+namespace reflect {
+
+// Reflects the compile-time configuration of a GroupedConvolutionForwardKernel instantiation.
+template <typename GroupedConvTraitsType_,
+          typename TilePartitioner_,
+          typename GemmPipeline_,
+          typename EpiloguePipeline_>
+struct InstanceTraits<ck_tile::device::GroupedConvolutionForwardKernel<GroupedConvTraitsType_,
+                                                                       TilePartitioner_,
+                                                                       GemmPipeline_,
+                                                                       EpiloguePipeline_>>
+{
+    // Values forwarded from GroupedConvTraitsType_
+    // Spatial dimension
+    static constexpr int kSpatialDim = GroupedConvTraitsType_::NDimSpatial;
+    // Specialization
+    static constexpr ck_tile::ConvolutionSpecialization ConvSpecialization =
+        GroupedConvTraitsType_::ConvSpecialization;
+    // Layout types
+    using InLayout  = typename GroupedConvTraitsType_::InLayout;
+    using WeiLayout = typename GroupedConvTraitsType_::WeiLayout;
+    using DsLayout  = typename GroupedConvTraitsType_::DsLayout;
+    using OutLayout = typename GroupedConvTraitsType_::OutLayout;
+    // Vector size
+    static constexpr int kVectorSizeA = GroupedConvTraitsType_::VectorSizeA;
+    static constexpr int kVectorSizeB = GroupedConvTraitsType_::VectorSizeB;
+    static constexpr int kVectorSizeC = GroupedConvTraitsType_::VectorSizeC;
+    // Num Groups To Merge
+    static constexpr int kNumGroupsToMerge = GroupedConvTraitsType_::NumGroupsToMerge;
+    // Split image (large tensors)
+    static constexpr bool kEnableSplitImage = GroupedConvTraitsType_::EnableSplitImage;
+
+    // Values forwarded from TilePartitioner_
+    // Block tile sizes
+    static constexpr int kMPerBlock = TilePartitioner_::MPerBlock;
+    static constexpr int kNPerBlock = TilePartitioner_::NPerBlock;
+    static constexpr int kKPerBlock = TilePartitioner_::KPerBlock;
+
+    static constexpr int kMWarp = TilePartitioner_::BlockGemmShape::BlockWarps::at(number<0>{});
+    static constexpr int kNWarp = TilePartitioner_::BlockGemmShape::BlockWarps::at(number<1>{});
+    static constexpr int kKWarp = TilePartitioner_::BlockGemmShape::BlockWarps::at(number<2>{});
+
+    static constexpr int kMWarpTile = TilePartitioner_::BlockGemmShape::WarpTile::at(number<0>{});
+    static constexpr int kNWarpTile = TilePartitioner_::BlockGemmShape::WarpTile::at(number<1>{});
+    static constexpr int kKWarpTile = TilePartitioner_::BlockGemmShape::WarpTile::at(number<2>{});
+
+    // A/B data types (taken from GemmPipeline_)
+    using ADataType = typename GemmPipeline_::ADataType;
+    using BDataType = typename GemmPipeline_::BDataType;
+    // Gemm Pipeline
+    using GemmPipeline                                                 = GemmPipeline_;
+    static constexpr ck_tile::GemmPipelineScheduler kPipelineScheduler = GemmPipeline_::Scheduler;
+    static constexpr bool kDoubleSmemBuffer = GemmPipeline_::DoubleSmemBuffer;
+    static constexpr int kNumWaveGroups     = GemmPipeline_::NumWaveGroups;
+
+    // Types forwarded from EpiloguePipeline_
+    using AccDataType             = typename EpiloguePipeline_::AccDataType;
+    using EDataType               = typename EpiloguePipeline_::ODataType;
+    using DsDataType              = typename EpiloguePipeline_::DsDataType;
+    using CDEElementwiseOperation = typename EpiloguePipeline_::CDElementwise;
+
+    // Returns "GroupedConvolutionForwardKernel<...>" with the 30 fields below, comma-separated.
+    static std::string instance_string()
+    {
+        std::ostringstream oss;
+
+        // Kernel type name
+        oss << "GroupedConvolutionForwardKernel";
+
+        // Fields are streamed in the same order as the members are declared above
+        oss << "<" << kSpatialDim; // 1. NDimSpatial
+        oss << ","
+            << ck_tile::getConvSpecializationString(ConvSpecialization);   // 2. ConvSpecialization
+        oss << "," << detail::layout_name<InLayout>();                     // 3. InLayout
+        oss << "," << detail::layout_name<WeiLayout>();                    // 4. WeiLayout
+        oss << "," << detail::tuple_name<DsLayout>();                      // 5. DsLayout
+        oss << "," << detail::layout_name<OutLayout>();                    // 6. OutLayout
+        oss << "," << kVectorSizeA;                                        // 7. VectorSizeA
+        oss << "," << kVectorSizeB;                                        // 8. VectorSizeB
+        oss << "," << kVectorSizeC;                                        // 9. VectorSizeC
+        oss << "," << kNumGroupsToMerge;                                   // 10. NumGroupsToMerge
+        oss << "," << kEnableSplitImage;                                   // 11. EnableSplitImage (0/1)
+        oss << "," << kMPerBlock;                                          // 12. MPerBlock
+        oss << "," << kNPerBlock;                                          // 13. NPerBlock
+        oss << "," << kKPerBlock;                                          // 14. KPerBlock
+        oss << "," << kMWarp;                                              // 15. MWarp
+        oss << "," << kNWarp;                                              // 16. NWarp
+        oss << "," << kKWarp;                                              // 17. KWarp
+        oss << "," << kMWarpTile;                                          // 18. MWarpTile
+        oss << "," << kNWarpTile;                                          // 19. NWarpTile
+        oss << "," << kKWarpTile;                                          // 20. KWarpTile
+        oss << "," << detail::type_name<ADataType>();                      // 21. ADataType
+        oss << "," << detail::type_name<BDataType>();                      // 22. BDataType
+        oss << "," << GemmPipeline::GetPipelineName();                     // 23. GemmPipeline name
+        oss << "," << detail::pipeline_scheduler_name(kPipelineScheduler); // 24. PipelineScheduler
+        oss << "," << kDoubleSmemBuffer;                                   // 25. DoubleSmemBuffer (0/1)
+        oss << "," << kNumWaveGroups;                                      // 26. NumWaveGroups
+        oss << "," << detail::type_name<AccDataType>();                    // 27. AccDataType
+        oss << "," << detail::type_name<EDataType>();                      // 28. EDataType
+        oss << "," << detail::tuple_name<DsDataType>();                    // 29. DsDataType
+        oss << ","
+            << detail::elementwise_op_name<CDEElementwiseOperation>(); // 30.
+                                                                       // CDEElementwiseOperation
+        oss << ">";
+
+        return oss.str();
+    }
+};
+
+} // namespace reflect
+} // namespace ck_tile
diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp
index b13675a7b9..2e918c5c2d 100644
--- a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp
+++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 // Utility functions and helpers for instance_traits.hpp
 // Contains helper functions to convert types, enums, and sequences to string representations.
@@ -9,9 +9,14 @@
 
 #include <array>
 #include <string>
+#include <concepts>
 #include <string_view>
 #include <sstream>
 #include <type_traits>
+#include <limits.h>
+#include <cmath>
+#include <ostream>
+#include <iostream>
 #include <ck/utility/data_type.hpp>
 #include <ck/utility/sequence.hpp>
 #include <ck/utility/blkgemmpipe_scheduler.hpp>
@@ -21,7 +26,12 @@
 #include <ck_tile/ops/common/tensor_layout.hpp>
 #include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
 #include <ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp>
+#include <ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp>
 #include <ck/tensor_operation/gpu/device/gemm_specialization.hpp>
+#include <ck_tile/ops/gemm.hpp>
+#include "ck_tile/ops/epilogue.hpp"
+#include "ck_tile/ops/grouped_convolution/utils/convolution_specialization.hpp"
+#include "ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp"
 
 namespace ck_tile::reflect::detail {
 
@@ -32,7 +42,7 @@ namespace impl {
 template <typename T>
 consteval std::string_view type_name_impl()
 {
-    if constexpr(std::is_same_v<T, ck::half_t>)
+    if constexpr(std::is_same_v<T, ck::half_t> || std::is_same_v<T, ck_tile::half_t>)
         return "fp16";
     else if constexpr(std::is_same_v<T, float>)
         return "fp32";
@@ -44,11 +54,11 @@ consteval std::string_view type_name_impl()
         return "s8";
     else if constexpr(std::is_same_v<T, int32_t>)
         return "s32";
-    else if constexpr(std::is_same_v<T, ck::bhalf_t>)
+    else if constexpr(std::is_same_v<T, ck::bhalf_t> || std::is_same_v<T, ck_tile::bf16_t>)
         return "bf16";
-    else if constexpr(std::is_same_v<T, ck::f8_t>)
+    else if constexpr(std::is_same_v<T, ck::f8_t> || std::is_same_v<T, ck_tile::fp8_t>)
         return "fp8";
-    else if constexpr(std::is_same_v<T, ck::bf8_t>)
+    else if constexpr(std::is_same_v<T, ck::bf8_t> || std::is_same_v<T, ck_tile::bf8_t>)
         return "bf8";
     else
         return std::string_view{}; // Return empty for supported types
@@ -112,6 +122,20 @@ conv_fwd_spec_name(ck::tensor_operation::device::ConvolutionForwardSpecializatio
     }
 }
 
+// Maps a ConvolutionBackwardWeightSpecialization value to the spelling of its enumerator
+constexpr std::string_view conv_bwd_weight_spec_name(
+    ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization spec)
+{
+    using Spec = ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization;
+    switch(spec)
+    {
+    case Spec::Default: return "Default";
+    case Spec::Filter1x1Stride1Pad0: return "Filter1x1Stride1Pad0";
+    case Spec::Filter1x1Pad0: return "Filter1x1Pad0";
+    case Spec::OddC: return "OddC";
+    }
+}
+
 // Convert GemmSpecialization enum to string
 constexpr std::string_view gemm_spec_name(ck::tensor_operation::device::GemmSpecialization spec)
 {
@@ -148,6 +172,17 @@ constexpr std::string_view pipeline_scheduler_name(ck::BlockGemmPipelineSchedule
     }
 }
 
+// Maps a ck_tile::GemmPipelineScheduler value to the spelling of its enumerator
+constexpr std::string_view pipeline_scheduler_name(ck_tile::GemmPipelineScheduler sched)
+{
+    switch(sched)
+    {
+    case ck_tile::GemmPipelineScheduler::Default: return "Default";
+    case ck_tile::GemmPipelineScheduler::Intrawave: return "Intrawave";
+    case ck_tile::GemmPipelineScheduler::Interwave: return "Interwave";
+    }
+}
+
 // Convert BlockGemmPipelineVersion enum to string
 constexpr std::string_view pipeline_version_name(ck::BlockGemmPipelineVersion ver)
 {
@@ -186,6 +221,26 @@ constexpr std::string_view loop_scheduler_name(ck::LoopScheduler sched)
     }
 }
 
+// Maps a ck_tile::TailNumber value to the spelling of its enumerator.
+// Every case label is written fully qualified, so no using-declaration is needed.
+constexpr std::string_view tail_number_name(ck_tile::TailNumber tail_num)
+{
+    switch(tail_num)
+    {
+    case ck_tile::TailNumber::Odd: return "Odd";
+    case ck_tile::TailNumber::Even: return "Even";
+    case ck_tile::TailNumber::One: return "One";
+    case ck_tile::TailNumber::Two: return "Two";
+    case ck_tile::TailNumber::Three: return "Three";
+    case ck_tile::TailNumber::Four: return "Four";
+    case ck_tile::TailNumber::Five: return "Five";
+    case ck_tile::TailNumber::Six: return "Six";
+    case ck_tile::TailNumber::Seven: return "Seven";
+    case ck_tile::TailNumber::Empty: return "Empty";
+    case ck_tile::TailNumber::Full: return "Full";
+    }
+}
+
 // Convert std::array to string
 template <typename T, std::size_t N>
 inline std::string array_to_string(const std::array<T, N>& arr)
@@ -336,17 +391,53 @@ constexpr std::string tuple_name()
     }(static_cast<T*>(nullptr));
 }
 
+// Names a ck_tile::tuple<Ts...>: "EmptyTuple" when empty, otherwise a "tuple" list string.
+template <typename T>
+    requires requires { []<typename... Ts>(ck_tile::tuple<Ts...>*) {}(static_cast<T*>(nullptr)); }
+constexpr std::string tuple_name()
+{
+    return []<typename... Elems>(ck_tile::tuple<Elems...>*) constexpr {
+        if constexpr(sizeof...(Elems) == 0)
+        {
+            return std::string("EmptyTuple");
+        }
+        else if constexpr((IsLayoutType<Elems> && ...))
+        {
+            // Every element is a layout: name each one through layout_name
+            auto namer = []<typename Elem>() { return layout_name<Elem>(); };
+            return detail::build_list_string<decltype(namer), Elems...>("tuple", namer);
+        }
+        else if constexpr((IsDataType<Elems> && ...))
+        {
+            // Every element is a data type: name each one through type_name
+            auto namer = []<typename Elem>() { return type_name<Elem>(); };
+            return detail::build_list_string<decltype(namer), Elems...>("tuple", namer);
+        }
+        else
+        {
+            static_assert((IsLayoutType<Elems> && ...) || (IsDataType<Elems> && ...),
+                          "tuple elements must be all layouts or all data types, not mixed");
+            return std::string{}; // unreachable
+        }
+    }(static_cast<T*>(nullptr));
+}
+
 // Concept to check if a type is a ck::Tuple
 template <typename T>
 concept IsCkTuple =
     requires { []<typename... Ts>(ck::Tuple<Ts...>*) {}(static_cast<T*>(nullptr)); };
 
+// Satisfied when a T* can be passed where a ck_tile::tuple<...>* is expected
+template <typename T>
+concept IsCkTileTuple =
+    requires { []<typename... Elems>(ck_tile::tuple<Elems...>*) {}(static_cast<T*>(nullptr)); };
+
 // Deduces whether to use tuple_name or type_name
 // Handles both scalar data types and ck::Tuple types
 template <typename T>
 constexpr std::string type_or_type_tuple_name()
 {
-    if constexpr(IsCkTuple<T>)
+    if constexpr(IsCkTuple<T> || IsCkTileTuple<T>)
     {
         return tuple_name<T>();
     }
@@ -356,4 +447,30 @@ constexpr std::string type_or_type_tuple_name()
     }
 }
 
+/// @brief Compares two string views for equality, ignoring ASCII letter case.
+/// @param a First string view
+/// @param b Second string view
+/// @return true if both views have equal length and match character by character
+constexpr bool case_insensitive_equal(std::string_view a, std::string_view b)
+{
+    // Fold 'A'..'Z' onto 'a'..'z'; every other character is left untouched.
+    constexpr auto fold = [](char ch) constexpr -> char {
+        return (ch >= 'A' && ch <= 'Z') ? static_cast<char>(ch + 32) : ch;
+    };
+
+    if(a.size() != b.size())
+    {
+        return false;
+    }
+
+    for(size_t pos = 0; pos != a.size(); ++pos)
+    {
+        if(fold(a[pos]) != fold(b[pos]))
+        {
+            return false;
+        }
+    }
+    return true;
+}
+
 } // namespace ck_tile::reflect::detail
diff --git a/experimental/builder/include/ck_tile/builder/reflect/tree_formatter.hpp b/experimental/builder/include/ck_tile/builder/reflect/tree_formatter.hpp
new file mode 100644
index 0000000000..6a80a994ee
--- /dev/null
+++ b/experimental/builder/include/ck_tile/builder/reflect/tree_formatter.hpp
@@ -0,0 +1,115 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include <cstddef>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+namespace ck_tile::reflect {
+
+// Helper class for formatting hierarchical tree structures with proper indentation
+// and tree-drawing characters (├─, └─, │, etc.)
+//
+// Example Usage:
+//
+//   TreeFormatter f;
+//   f.writeLine(0, "Root");
+//   f.writeLine(1, "Branch 1");
+//   f.writeLine(2, "Item 1a");
+//   f.writeLast(2, "Item 1b");
+//   f.writeLast(1, "Branch 2");
+//   f.writeLast(2, "Item 2a");
+//   std::cout << f.getString() << "\n";
+//
+// Generated Output:
+//
+//   Root
+//   ├─ Branch 1
+//   │  ├─ Item 1a
+//   │  └─ Item 1b
+//   └─ Branch 2
+//      └─ Item 2a
+class TreeFormatter
+{
+    public:
+    TreeFormatter() = default;
+
+    // Write a line at the specified indentation level (branch continues after this).
+    // The args are streamed into the line in order; a negative level is treated as 0.
+    template <typename... Args>
+    void writeLine(int indent_level, Args&&... args)
+    {
+        writeLineImpl(indent_level, false, std::forward<Args>(args)...);
+    }
+
+    // Write the last line at the specified indentation level (branch ends).
+    // The args are streamed into the line in order; a negative level is treated as 0.
+    template <typename... Args>
+    void writeLast(int indent_level, Args&&... args)
+    {
+        writeLineImpl(indent_level, true, std::forward<Args>(args)...);
+    }
+
+    // Get the formatted string (removes trailing newline if present)
+    std::string getString() const
+    {
+        std::string result = oss_.str();
+        if(!result.empty() && result.back() == '\n')
+        {
+            result.pop_back();
+        }
+        return result;
+    }
+
+    private:
+    std::ostringstream oss_;
+    // is_last_at_level_[i] records whether the most recent line written at level i was
+    // written with writeLast(); deeper lines use it to choose between "│  " and "   ".
+    std::vector<bool> is_last_at_level_;
+
+    // Implementation of line writing with tree symbols
+    template <typename... Args>
+    void writeLineImpl(int indent_level, bool is_last, Args&&... args)
+    {
+        // A negative level would index is_last_at_level_ out of bounds; treat it as the root.
+        const std::size_t level = indent_level > 0 ? static_cast<std::size_t>(indent_level) : 0;
+
+        // Ensure we have enough tracking space
+        if(level >= is_last_at_level_.size())
+        {
+            is_last_at_level_.resize(level + 1, false);
+            // Level 0 (root) should always be treated as "last" since it has no tree symbols
+            is_last_at_level_[0] = true;
+        }
+
+        // Draw the tree structure
+        // Start from level 1 (skip level 0 which is the root with no symbols)
+        for(std::size_t i = 1; i < level; ++i)
+        {
+            // For all parent levels, draw vertical line or space based on whether they ended
+            oss_ << (is_last_at_level_[i] ? "   " : "│  ");
+        }
+
+        // Draw the branch symbol for the current level
+        if(level > 0)
+        {
+            oss_ << (is_last ? "└─ " : "├─ ");
+        }
+
+        // Write the content using fold expression with direct stream insertion
+        ((oss_ << std::forward<Args>(args)), ...);
+
+        oss_ << '\n';
+
+        // Update tracking for this level AFTER writing the line
+        // This ensures future lines at deeper levels know if this level ended
+        is_last_at_level_[level] = is_last;
+    }
+};
+
+} // namespace ck_tile::reflect
diff --git a/experimental/builder/include/ck_tile/builder/types.hpp b/experimental/builder/include/ck_tile/builder/types.hpp
index a2ef89da2e..a58c994288 100644
--- a/experimental/builder/include/ck_tile/builder/types.hpp
+++ b/experimental/builder/include/ck_tile/builder/types.hpp
@@ -1,8 +1,12 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
+#include <ostream>
+#include <string_view>
+#include <variant>
+
 namespace ck_tile::builder {
 
 enum class DataType
@@ -128,29 +132,14 @@ enum class ElementwiseOperation
     PASS_THROUGH
 };
 
-// Enums for the current block GEMM pipeline versions.
-enum class BlockGemmPipelineVersion
+// Enums for pipeline versions & schedulers
+enum class PipelineVersion
 {
     V1,
     V2,
     V3,
     V4,
-    V5
-};
-
-enum struct BlockGemmPipelineScheduler
-{
-    INTRAWAVE,
-    INTERWAVE,
-};
-
-// Enums for the gridwise GEMM pipeline versions.
-enum class GridwiseGemmPipelineVersion
-{
-    V1,
-    V2,
-    V3, // Only used in stream-K implementation
-    V4,
+    V5,
     WEIGHT_ONLY
 };
 
@@ -186,10 +175,319 @@ enum class ConvFwdSpecialization
     FILTER_3x3
 };
 
-enum class LoopScheduler
+// Enums for the backward data convolution specialization.
+enum class ConvBwdDataSpecialization
 {
     DEFAULT,
+    FILTER_1X1_STRIDE1_PAD0,
+};
+
+// Enums for the backward weight convolution specialization.
+enum class ConvBwdWeightSpecialization
+{
+    DEFAULT,
+    FILTER_1X1_STRIDE1_PAD0,
+    FILTER_1X1_PAD0,
+    ODD_C,
+};
+
+// Enums for the Gemm padding.
+enum class GemmPadding
+{
+    DEFAULT,
+    M_PADDING,
+    N_PADDING,
+    K_PADDING,
+    MN_PADDING,
+    MK_PADDING,
+    NK_PADDING,
+    MNK_PADDING,
+    O_PADDING,
+    MO_PADDING,
+    NO_PADDING,
+    KO_PADDING,
+    MNO_PADDING,
+    MKO_PADDING,
+    NKO_PADDING,
+    MNKO_PADDING,
+};
+
+enum class PipelineScheduler
+{
+    DEFAULT,
+    INTRAWAVE,
     INTERWAVE
 };
 
+// ostream operator overloads for enum classes
+// Streams the name of a DataType value; values not listed below print "Unknown".
+inline std::ostream& operator<<(std::ostream& os, DataType dt)
+{
+    switch(dt)
+    {
+    case DataType::FP16: return os << "FP16";
+    case DataType::FP32: return os << "FP32";
+    case DataType::BF16: return os << "BF16";
+    case DataType::FP8: return os << "FP8";
+    case DataType::I8: return os << "I8";
+    case DataType::U8: return os << "U8";
+    default: return os << "Unknown";
+    }
+}
+
+// Streams a human-readable name for a ConvDirection; unlisted values print "Unknown".
+inline std::ostream& operator<<(std::ostream& os, ConvDirection dir)
+{
+    switch(dir)
+    {
+    case ConvDirection::FORWARD: return os << "Forward";
+    case ConvDirection::BACKWARD_DATA: return os << "Backward Data";
+    case ConvDirection::BACKWARD_WEIGHT: return os << "Backward Weight";
+    default: return os << "Unknown";
+    }
+}
+
+// Streams the enumerator name of a GroupConvLayout1D; unlisted values print "Unknown".
+inline std::ostream& operator<<(std::ostream& os, GroupConvLayout1D layout)
+{
+    switch(layout)
+    {
+    case GroupConvLayout1D::GNWC_GKXC_GNWK: return os << "GNWC_GKXC_GNWK";
+    case GroupConvLayout1D::NWGC_GKXC_NWGK: return os << "NWGC_GKXC_NWGK";
+    case GroupConvLayout1D::NGCW_GKXC_NGKW: return os << "NGCW_GKXC_NGKW";
+    case GroupConvLayout1D::NGCW_GKCX_NGKW: return os << "NGCW_GKCX_NGKW";
+    default: return os << "Unknown";
+    }
+}
+
+// Streams the enumerator name of a GroupConvLayout2D; unlisted values print "Unknown".
+inline std::ostream& operator<<(std::ostream& os, GroupConvLayout2D layout)
+{
+    switch(layout)
+    {
+    case GroupConvLayout2D::GNHWC_GKYXC_GNHWK: return os << "GNHWC_GKYXC_GNHWK";
+    case GroupConvLayout2D::NHWGC_GKYXC_NHWGK: return os << "NHWGC_GKYXC_NHWGK";
+    case GroupConvLayout2D::NGCHW_GKYXC_NGKHW: return os << "NGCHW_GKYXC_NGKHW";
+    case GroupConvLayout2D::NGCHW_GKCYX_NGKHW: return os << "NGCHW_GKCYX_NGKHW";
+    default: return os << "Unknown";
+    }
+}
+
+// Streams the enumerator name of a GroupConvLayout3D; unlisted values print "Unknown".
+inline std::ostream& operator<<(std::ostream& os, GroupConvLayout3D layout)
+{
+    switch(layout)
+    {
+    case GroupConvLayout3D::GNDHWC_GKZYXC_GNDHWK: return os << "GNDHWC_GKZYXC_GNDHWK";
+    case GroupConvLayout3D::NDHWGC_GKZYXC_NDHWGK: return os << "NDHWGC_GKZYXC_NDHWGK";
+    case GroupConvLayout3D::NGCDHW_GKZYXC_NGKDHW: return os << "NGCDHW_GKZYXC_NGKDHW";
+    case GroupConvLayout3D::NGCDHW_GKCZYX_NGKDHW: return os << "NGCDHW_GKCZYX_NGKDHW";
+    default: return os << "Unknown";
+    }
+}
+
+// Streams the enumerator name of a forward device operation; unlisted values print "Unknown".
+inline std::ostream& operator<<(std::ostream& os, FwdGroupConvDeviceOperation op)
+{
+    switch(op)
+    {
+    case FwdGroupConvDeviceOperation::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK:
+        return os << "DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK";
+    case FwdGroupConvDeviceOperation::DeviceGroupedConvFwdMultipleD_Wmma_CShuffle:
+        return os << "DeviceGroupedConvFwdMultipleD_Wmma_CShuffle";
+    case FwdGroupConvDeviceOperation::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle:
+        return os << "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle";
+    case FwdGroupConvDeviceOperation::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3:
+        return os << "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3";
+    case FwdGroupConvDeviceOperation::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor:
+        return os << "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor";
+    default: return os << "Unknown";
+    }
+}
+
+inline std::ostream& operator<<(std::ostream& os, BwdDataGroupConvDeviceOperation op)
+{
+    using Op = BwdDataGroupConvDeviceOperation; // short alias for the case labels
+    switch(op)
+    {
+    case Op::DeviceGroupedConvBwdDataMultipleD: return os << "DeviceGroupedConvBwdDataMultipleD";
+    case Op::DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle:
+        return os << "DeviceGroupedConvBwdDataMultipleD_Wmma_CShuffle";
+    case Op::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1:
+        return os << "DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1";
+    default: return os << "Unknown"; // any value without a case above
+    }
+}
+
+inline std::ostream& operator<<(std::ostream& os, BwdWeightGroupConvDeviceOperation op)
+{
+    using enum BwdWeightGroupConvDeviceOperation; // lets the case labels omit the enum name
+    switch(op)
+    {
+    case DeviceGroupedConvBwdWeight: return os << "DeviceGroupedConvBwdWeight";
+    case DeviceGroupedConvBwdWeight_Dl: return os << "DeviceGroupedConvBwdWeight_Dl";
+    case DeviceGroupedConvBwdWeight_Xdl_CShuffle:
+        return os << "DeviceGroupedConvBwdWeight_Xdl_CShuffle";
+    case DeviceGroupedConvBwdWeight_Xdl_CShuffleV3:
+        return os << "DeviceGroupedConvBwdWeight_Xdl_CShuffleV3";
+    case DeviceGroupedConvBwdWeight_Wmma_CShuffle:
+        return os << "DeviceGroupedConvBwdWeight_Wmma_CShuffle";
+    case DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle:
+        return os << "DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle";
+    case DeviceGroupedConvBwdWeightMultipleD: return os << "DeviceGroupedConvBwdWeightMultipleD";
+    case DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle:
+        return os << "DeviceGroupedConvBwdWeightMultipleD_Xdl_CShuffle";
+    default: return os << "Unknown"; // any value without a case above
+    }
+}
+
+// Streams the enumerator name of an ElementwiseOperation; unlisted values print "Unknown".
+inline std::ostream& operator<<(std::ostream& os, ElementwiseOperation op)
+{
+    switch(op)
+    {
+    case ElementwiseOperation::BIAS: return os << "BIAS";
+    case ElementwiseOperation::BIAS_CLAMP: return os << "BIAS_CLAMP";
+    case ElementwiseOperation::BIAS_BNORM_CLAMP: return os << "BIAS_BNORM_CLAMP";
+    case ElementwiseOperation::BILINEAR: return os << "BILINEAR";
+    case ElementwiseOperation::CLAMP: return os << "CLAMP";
+    case ElementwiseOperation::SCALE: return os << "SCALE";
+    case ElementwiseOperation::PASS_THROUGH: return os << "PASS_THROUGH";
+    default: return os << "Unknown";
+    }
+}
+
+// Streams the enumerator name of a PipelineVersion; unlisted values print "Unknown".
+inline std::ostream& operator<<(std::ostream& os, PipelineVersion ver)
+{
+    switch(ver)
+    {
+    case PipelineVersion::V1: return os << "V1";
+    case PipelineVersion::V2: return os << "V2";
+    case PipelineVersion::V3: return os << "V3";
+    case PipelineVersion::V4: return os << "V4";
+    case PipelineVersion::V5: return os << "V5";
+    case PipelineVersion::WEIGHT_ONLY: return os << "WEIGHT_ONLY";
+    default: return os << "Unknown";
+    }
+}
+
+// Streams the enumerator name of a GemmSpecialization; unlisted values print "Unknown".
+inline std::ostream& operator<<(std::ostream& os, GemmSpecialization spec)
+{
+    switch(spec)
+    {
+    case GemmSpecialization::Default: return os << "Default";
+    case GemmSpecialization::MPadding: return os << "MPadding";
+    case GemmSpecialization::NPadding: return os << "NPadding";
+    case GemmSpecialization::KPadding: return os << "KPadding";
+    case GemmSpecialization::MNPadding: return os << "MNPadding";
+    case GemmSpecialization::MKPadding: return os << "MKPadding";
+    case GemmSpecialization::NKPadding: return os << "NKPadding";
+    case GemmSpecialization::MNKPadding: return os << "MNKPadding";
+    case GemmSpecialization::OPadding: return os << "OPadding";
+    case GemmSpecialization::MOPadding: return os << "MOPadding";
+    case GemmSpecialization::NOPadding: return os << "NOPadding";
+    case GemmSpecialization::KOPadding: return os << "KOPadding";
+    case GemmSpecialization::MNOPadding: return os << "MNOPadding";
+    case GemmSpecialization::MKOPadding: return os << "MKOPadding";
+    case GemmSpecialization::NKOPadding: return os << "NKOPadding";
+    case GemmSpecialization::MNKOPadding: return os << "MNKOPadding";
+    default: return os << "Unknown";
+    }
+}
+
+// Streams the enumerator name of a ConvFwdSpecialization; unlisted values print "Unknown".
+inline std::ostream& operator<<(std::ostream& os, ConvFwdSpecialization spec)
+{
+    switch(spec)
+    {
+    case ConvFwdSpecialization::DEFAULT: return os << "DEFAULT";
+    case ConvFwdSpecialization::FILTER_1X1_PAD0: return os << "FILTER_1X1_PAD0";
+    case ConvFwdSpecialization::FILTER_1X1_STRIDE1_PAD0: return os << "FILTER_1X1_STRIDE1_PAD0";
+    case ConvFwdSpecialization::FILTER_3x3: return os << "FILTER_3x3";
+    default: return os << "Unknown";
+    }
+}
+
+// Streams the enumerator name of a ConvBwdDataSpecialization; unlisted values print "Unknown".
+inline std::ostream& operator<<(std::ostream& os, ConvBwdDataSpecialization spec)
+{
+    switch(spec)
+    {
+    case ConvBwdDataSpecialization::DEFAULT: return os << "DEFAULT";
+    case ConvBwdDataSpecialization::FILTER_1X1_STRIDE1_PAD0: return os << "FILTER_1X1_STRIDE1_PAD0";
+    default: return os << "Unknown";
+    }
+}
+
+inline std::ostream& operator<<(std::ostream& os, ConvBwdWeightSpecialization spec)
+{
+    using Spec = ConvBwdWeightSpecialization; // short alias for the case labels
+    switch(spec)
+    {
+    case Spec::DEFAULT: return os << "DEFAULT";
+    case Spec::FILTER_1X1_STRIDE1_PAD0: return os << "FILTER_1X1_STRIDE1_PAD0";
+    case Spec::FILTER_1X1_PAD0: return os << "FILTER_1X1_PAD0";
+    case Spec::ODD_C: return os << "ODD_C";
+    default: return os << "Unknown"; // any value without a case above
+    }
+}
+
+// Streams the enumerator name of a GemmPadding value; unlisted values print "Unknown".
+inline std::ostream& operator<<(std::ostream& os, GemmPadding padding)
+{
+    switch(padding)
+    {
+    case GemmPadding::DEFAULT: return os << "DEFAULT";
+    case GemmPadding::M_PADDING: return os << "M_PADDING";
+    case GemmPadding::N_PADDING: return os << "N_PADDING";
+    case GemmPadding::K_PADDING: return os << "K_PADDING";
+    case GemmPadding::MN_PADDING: return os << "MN_PADDING";
+    case GemmPadding::MK_PADDING: return os << "MK_PADDING";
+    case GemmPadding::NK_PADDING: return os << "NK_PADDING";
+    case GemmPadding::MNK_PADDING: return os << "MNK_PADDING";
+    case GemmPadding::O_PADDING: return os << "O_PADDING";
+    case GemmPadding::MO_PADDING: return os << "MO_PADDING";
+    case GemmPadding::NO_PADDING: return os << "NO_PADDING";
+    case GemmPadding::KO_PADDING: return os << "KO_PADDING";
+    case GemmPadding::MNO_PADDING: return os << "MNO_PADDING";
+    case GemmPadding::MKO_PADDING: return os << "MKO_PADDING";
+    case GemmPadding::NKO_PADDING: return os << "NKO_PADDING";
+    case GemmPadding::MNKO_PADDING: return os << "MNKO_PADDING";
+    default: return os << "Unknown";
+    }
+}
+
+// Streams the enumerator name of a PipelineScheduler; unlisted values print "Unknown".
+inline std::ostream& operator<<(std::ostream& os, PipelineScheduler sched)
+{
+    switch(sched)
+    {
+    case PipelineScheduler::DEFAULT: return os << "DEFAULT";
+    case PipelineScheduler::INTRAWAVE: return os << "INTRAWAVE";
+    case PipelineScheduler::INTERWAVE: return os << "INTERWAVE";
+    default: return os << "Unknown";
+    }
+}
+
+// Streams the layout alternative currently held by the variant, using that alternative's
+// own operator<<.
+inline std::ostream&
+operator<<(std::ostream& os,
+           const std::variant<GroupConvLayout1D, GroupConvLayout2D, GroupConvLayout3D>& layout)
+{
+    return std::visit([&os](const auto& held) -> std::ostream& { return os << held; }, layout);
+}
+
+// Streams the convolution specialization currently held by the variant, using that
+// alternative's own operator<<.
+inline std::ostream& operator<<(std::ostream& os,
+                                const std::variant<ConvFwdSpecialization,
+                                                   ConvBwdDataSpecialization,
+                                                   ConvBwdWeightSpecialization>& spec)
+{
+    return std::visit([&os](const auto& held) -> std::ostream& { return os << held; }, spec);
+}
+
 } // namespace ck_tile::builder
diff --git a/experimental/builder/include/ck_tile/builder/versions.hpp b/experimental/builder/include/ck_tile/builder/versions.hpp
index e8fb2fe4de..4ee45b730b 100644
--- a/experimental/builder/include/ck_tile/builder/versions.hpp
+++ b/experimental/builder/include/ck_tile/builder/versions.hpp
@@ -1,3 +1,6 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
 #pragma once
 
 #include <concepts>
diff --git a/experimental/builder/test/CMakeLists.txt b/experimental/builder/test/CMakeLists.txt
index 26a666a805..43c4fd4857 100644
--- a/experimental/builder/test/CMakeLists.txt
+++ b/experimental/builder/test/CMakeLists.txt
@@ -20,6 +20,7 @@ endfunction()
 add_ck_builder_test(test_ckb_conv_builder
     test_conv_builder.cpp
     test_fwd_instance_traits.cpp
+    test_bwd_weight_instance_traits.cpp
     test_instance_traits_util.cpp)
 
 add_ck_builder_test(test_ckb_inline_diff test_inline_diff.cpp)
@@ -30,7 +31,8 @@ add_ck_builder_test(test_ckb_get_instance_string
     test_get_instance_string_fwd_grp_conv.cpp
     test_get_instance_string_fwd_grp_conv_large_tensor.cpp
     test_get_instance_string_fwd_grp_conv_wmma.cpp
-    test_get_instance_string_fwd_grp_conv_dl.cpp)
+    test_get_instance_string_fwd_grp_conv_dl.cpp
+    test_get_instance_string_bwd_weight_grp_conv_xdl.cpp)
 
 # Testing the fwd convolution builder requires kernel compilation.
 # To enable parallel compilation, the individual tests are split into separate files.
@@ -41,6 +43,8 @@ add_ck_builder_test(test_ckb_build_fwd_instances
     conv/test_ckb_conv_fwd_2d_bf16.cpp
     conv/test_ckb_conv_fwd_2d_fp16.cpp
     conv/test_ckb_conv_fwd_2d_fp32.cpp
+    conv/test_ckb_conv_fwd_2d_dl_fp16.cpp
+    conv/test_ckb_conv_fwd_2d_large_tensor_fp16.cpp
     conv/test_ckb_conv_fwd_3d_bf16.cpp
     conv/test_ckb_conv_fwd_3d_fp16.cpp
     conv/test_ckb_conv_fwd_3d_fp32.cpp)
@@ -62,6 +66,12 @@ add_ck_factory_test(test_ckb_factory_grouped_convolution_forward_bias_bnorm_clam
 add_ck_factory_test(test_ckb_factory_grouped_convolution_forward_scaleadd_scaleadd_relu test_ck_factory_grouped_convolution_forward_scaleadd_scaleadd_relu.cpp)
 add_ck_factory_test(test_ckb_factory_grouped_convolution_forward_dynamic_op test_ck_factory_grouped_convolution_forward_dynamic_op.cpp)
 
+# The other builder test targets in this file are named test_ckb_*; keep that prefix here too.
+# collect_test_ckb_targets() below presumably selects targets by that name -- confirm.
+add_ck_builder_test(test_ckb_conv_traits conv/test_conv_traits.cpp)
+
+add_ck_builder_test(test_ckb_conv_description test_conv_description.cpp)
+
 # Function to add all test_ckb targets to a list
 function(collect_test_ckb_targets result_var)
     # Get all targets in current directory
diff --git a/experimental/builder/test/conv/test_ckb_conv_fwd_1d_bf16.cpp b/experimental/builder/test/conv/test_ckb_conv_fwd_1d_bf16.cpp
index 472c43438d..123034eb77 100644
--- a/experimental/builder/test/conv/test_ckb_conv_fwd_1d_bf16.cpp
+++ b/experimental/builder/test/conv/test_ckb_conv_fwd_1d_bf16.cpp
@@ -1,3 +1,6 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
 #include "utils/ckb_conv_test_common.hpp"
 
 using namespace ck_tile::builder::test_utils;
@@ -24,7 +27,7 @@ TEST(FwdConvInstances,
     run_test_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<
         FwdConvSignature,
         FwdThreadBlock,
-        BlockGemmPipelineVersion::V2,
+        PipelineVersion::V2,
         ConvFwdSpecialization::FILTER_1X1_STRIDE1_PAD0>();
 }
 
diff --git a/experimental/builder/test/conv/test_ckb_conv_fwd_1d_fp16.cpp b/experimental/builder/test/conv/test_ckb_conv_fwd_1d_fp16.cpp
index 3f840ba2b0..a83ca84297 100644
--- a/experimental/builder/test/conv/test_ckb_conv_fwd_1d_fp16.cpp
+++ b/experimental/builder/test/conv/test_ckb_conv_fwd_1d_fp16.cpp
@@ -1,3 +1,6 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
 #include "utils/ckb_conv_test_common.hpp"
 
 using namespace ck_tile::builder::test_utils;
diff --git a/experimental/builder/test/conv/test_ckb_conv_fwd_1d_i8.cpp b/experimental/builder/test/conv/test_ckb_conv_fwd_1d_i8.cpp
index 1819cca728..3ceac2a047 100644
--- a/experimental/builder/test/conv/test_ckb_conv_fwd_1d_i8.cpp
+++ b/experimental/builder/test/conv/test_ckb_conv_fwd_1d_i8.cpp
@@ -1,3 +1,6 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
 #include "utils/ckb_conv_test_common.hpp"
 
 using namespace ck_tile::builder::test_utils;
diff --git a/experimental/builder/test/conv/test_ckb_conv_fwd_2d_bf16.cpp b/experimental/builder/test/conv/test_ckb_conv_fwd_2d_bf16.cpp
index b9969f7e95..240746f546 100644
--- a/experimental/builder/test/conv/test_ckb_conv_fwd_2d_bf16.cpp
+++ b/experimental/builder/test/conv/test_ckb_conv_fwd_2d_bf16.cpp
@@ -1,3 +1,6 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
 #include "utils/ckb_conv_test_common.hpp"
 
 using namespace ck_tile::builder::test_utils;
@@ -22,7 +25,7 @@ TEST(FwdConvInstances,
 
     run_test_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<FwdConvSignature,
                                                              FwdThreadBlock,
-                                                             BlockGemmPipelineVersion::V1,
+                                                             PipelineVersion::V1,
                                                              ConvFwdSpecialization::DEFAULT>();
 }
 
@@ -44,7 +47,7 @@ TEST(FwdConvInstances,
 
     run_test_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<FwdConvSignature,
                                                              FwdThreadBlock,
-                                                             BlockGemmPipelineVersion::V5,
+                                                             PipelineVersion::V5,
                                                              ConvFwdSpecialization::FILTER_3x3>();
 }
 
diff --git a/experimental/builder/test/conv/test_ckb_conv_fwd_2d_dl_fp16.cpp b/experimental/builder/test/conv/test_ckb_conv_fwd_2d_dl_fp16.cpp
new file mode 100644
index 0000000000..12730bab19
--- /dev/null
+++ b/experimental/builder/test/conv/test_ckb_conv_fwd_2d_dl_fp16.cpp
@@ -0,0 +1,69 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "utils/ckb_conv_test_common.hpp"
+
+using namespace ck_tile::builder::test_utils;
+
+namespace ck_tile::builder::testing {
+
+// 2D FP16 forward conv on the DL device op: GNHWC_GKYXC_GNHWK layout, DEFAULT specialization.
+TEST(FwdConvInstances, Create_DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK_Instance_2D_FP16_GNHWC)
+{
+    constexpr ConvSignature FwdConvSignature{
+        .spatial_dim           = 2,
+        .direction             = ConvDirection::FORWARD,
+        .layout                = GroupConvLayout2D::GNHWC_GKYXC_GNHWK,
+        .data_type             = DataType::FP16,
+        .elementwise_operation = ElementwiseOperation::PASS_THROUGH,
+        .device_operation =
+            FwdGroupConvDeviceOperation::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK};
+    constexpr ThreadBlock FwdThreadBlock{.block_size = 256,
+                                         .tile_size  = {.m = 128, .n = 128, .k = 16}};
+
+    run_test_DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<FwdConvSignature,
+                                                            FwdThreadBlock,
+                                                            ConvFwdSpecialization::DEFAULT>();
+}
+
+// 2D FP16 forward conv on the DL device op: NHWGC_GKYXC_NHWGK layout, DEFAULT specialization.
+TEST(FwdConvInstances, Create_DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK_Instance_2D_FP16_NHWGC)
+{
+    constexpr ConvSignature FwdConvSignature{
+        .spatial_dim           = 2,
+        .direction             = ConvDirection::FORWARD,
+        .layout                = GroupConvLayout2D::NHWGC_GKYXC_NHWGK,
+        .data_type             = DataType::FP16,
+        .elementwise_operation = ElementwiseOperation::PASS_THROUGH,
+        .device_operation =
+            FwdGroupConvDeviceOperation::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK};
+    constexpr ThreadBlock FwdThreadBlock{.block_size = 256,
+                                         .tile_size  = {.m = 128, .n = 128, .k = 16}};
+
+    run_test_DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<FwdConvSignature,
+                                                            FwdThreadBlock,
+                                                            ConvFwdSpecialization::DEFAULT>();
+}
+
+// 2D FP16 forward conv on the DL device op: GNHWC_GKYXC_GNHWK layout, FILTER_1X1_PAD0 specialization.
+TEST(FwdConvInstances,
+     Create_DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK_Instance_2D_FP16_FILTER_1X1_PAD0)
+{
+    constexpr ConvSignature FwdConvSignature{
+        .spatial_dim           = 2,
+        .direction             = ConvDirection::FORWARD,
+        .layout                = GroupConvLayout2D::GNHWC_GKYXC_GNHWK,
+        .data_type             = DataType::FP16,
+        .elementwise_operation = ElementwiseOperation::PASS_THROUGH,
+        .device_operation =
+            FwdGroupConvDeviceOperation::DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK};
+    constexpr ThreadBlock FwdThreadBlock{.block_size = 256,
+                                         .tile_size  = {.m = 128, .n = 128, .k = 16}};
+
+    run_test_DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK<
+        FwdConvSignature,
+        FwdThreadBlock,
+        ConvFwdSpecialization::FILTER_1X1_PAD0>();
+}
+
+} // namespace ck_tile::builder::testing
diff --git a/experimental/builder/test/conv/test_ckb_conv_fwd_2d_fp16.cpp b/experimental/builder/test/conv/test_ckb_conv_fwd_2d_fp16.cpp
index cd5186cc10..6366016707 100644
--- a/experimental/builder/test/conv/test_ckb_conv_fwd_2d_fp16.cpp
+++ b/experimental/builder/test/conv/test_ckb_conv_fwd_2d_fp16.cpp
@@ -1,3 +1,6 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
 #include "utils/ckb_conv_test_common.hpp"
 
 using namespace ck_tile::builder::test_utils;
@@ -22,7 +25,7 @@ TEST(FwdConvInstances,
     run_test_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<
         FwdConvSignature,
         FwdThreadBlock,
-        BlockGemmPipelineVersion::V3,
+        PipelineVersion::V3,
         ConvFwdSpecialization::FILTER_1X1_PAD0>();
 }
 
diff --git a/experimental/builder/test/conv/test_ckb_conv_fwd_2d_fp32.cpp b/experimental/builder/test/conv/test_ckb_conv_fwd_2d_fp32.cpp
index 584e0ab182..7b303a7bde 100644
--- a/experimental/builder/test/conv/test_ckb_conv_fwd_2d_fp32.cpp
+++ b/experimental/builder/test/conv/test_ckb_conv_fwd_2d_fp32.cpp
@@ -1,3 +1,6 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
 #include "utils/ckb_conv_test_common.hpp"
 
 using namespace ck_tile::builder::test_utils;
@@ -22,7 +25,7 @@ TEST(FwdConvInstances,
     run_test_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<
         FwdConvSignature,
         FwdThreadBlock,
-        BlockGemmPipelineVersion::V4,
+        PipelineVersion::V4,
         ConvFwdSpecialization::FILTER_1X1_STRIDE1_PAD0>();
 }
 
diff --git a/experimental/builder/test/conv/test_ckb_conv_fwd_2d_large_tensor_fp16.cpp b/experimental/builder/test/conv/test_ckb_conv_fwd_2d_large_tensor_fp16.cpp
new file mode 100644
index 0000000000..0216c5907d
--- /dev/null
+++ b/experimental/builder/test/conv/test_ckb_conv_fwd_2d_large_tensor_fp16.cpp
@@ -0,0 +1,53 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "utils/ckb_conv_test_common.hpp"
+
+using namespace ck_tile::builder::test_utils;
+
+namespace ck_tile::builder::testing {
+
+// 2D FP16 forward conv on the Large_Tensor device op: block_size 256, 256x128x32 tile, DEFAULT.
+TEST(FwdConvInstances,
+     Create_DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor_Instance_2D_FP16_GNHWC)
+{
+    constexpr ConvSignature FwdConvSignature{
+        .spatial_dim           = 2,
+        .direction             = ConvDirection::FORWARD,
+        .layout                = GroupConvLayout2D::GNHWC_GKYXC_GNHWK,
+        .data_type             = DataType::FP16,
+        .elementwise_operation = ElementwiseOperation::PASS_THROUGH,
+        .device_operation =
+            FwdGroupConvDeviceOperation::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor};
+    constexpr ThreadBlock FwdThreadBlock{.block_size = 256,
+                                         .tile_size  = {.m = 256, .n = 128, .k = 32}};
+
+    run_test_DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<
+        FwdConvSignature,
+        FwdThreadBlock,
+        ConvFwdSpecialization::DEFAULT>();
+}
+
+// 2D FP16 forward conv on the Large_Tensor device op: block_size 128, 128x128x32 tile, FILTER_1X1_PAD0.
+TEST(
+    FwdConvInstances,
+    Create_DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor_Instance_2D_FP16_GNHWC_Filter1x1Pad0)
+{
+    constexpr ConvSignature FwdConvSignature{
+        .spatial_dim           = 2,
+        .direction             = ConvDirection::FORWARD,
+        .layout                = GroupConvLayout2D::GNHWC_GKYXC_GNHWK,
+        .data_type             = DataType::FP16,
+        .elementwise_operation = ElementwiseOperation::PASS_THROUGH,
+        .device_operation =
+            FwdGroupConvDeviceOperation::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor};
+    constexpr ThreadBlock FwdThreadBlock{.block_size = 128,
+                                         .tile_size  = {.m = 128, .n = 128, .k = 32}};
+
+    run_test_DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<
+        FwdConvSignature,
+        FwdThreadBlock,
+        ConvFwdSpecialization::FILTER_1X1_PAD0>();
+}
+
+} // namespace ck_tile::builder::testing
diff --git a/experimental/builder/test/conv/test_ckb_conv_fwd_3d_bf16.cpp b/experimental/builder/test/conv/test_ckb_conv_fwd_3d_bf16.cpp
index 17caf98457..b40dd0b0d7 100644
--- a/experimental/builder/test/conv/test_ckb_conv_fwd_3d_bf16.cpp
+++ b/experimental/builder/test/conv/test_ckb_conv_fwd_3d_bf16.cpp
@@ -1,3 +1,6 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
 #include "utils/ckb_conv_test_common.hpp"
 
 using namespace ck_tile::builder::test_utils;
@@ -22,7 +25,7 @@ TEST(FwdConvInstances,
 
     run_test_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<FwdConvSignature,
                                                              FwdThreadBlock,
-                                                             BlockGemmPipelineVersion::V3,
+                                                             PipelineVersion::V3,
                                                              ConvFwdSpecialization::DEFAULT>();
 }
 
diff --git a/experimental/builder/test/conv/test_ckb_conv_fwd_3d_fp16.cpp b/experimental/builder/test/conv/test_ckb_conv_fwd_3d_fp16.cpp
index ec4649a6ff..e0dad4e1a1 100644
--- a/experimental/builder/test/conv/test_ckb_conv_fwd_3d_fp16.cpp
+++ b/experimental/builder/test/conv/test_ckb_conv_fwd_3d_fp16.cpp
@@ -1,3 +1,6 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
 #include "utils/ckb_conv_test_common.hpp"
 
 using namespace ck_tile::builder::test_utils;
@@ -23,7 +26,7 @@ TEST(FwdConvInstances,
     run_test_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<
         FwdConvSignature,
         FwdThreadBlock,
-        BlockGemmPipelineVersion::V4,
+        PipelineVersion::V4,
         ConvFwdSpecialization::FILTER_1X1_PAD0>();
 }
 
diff --git a/experimental/builder/test/conv/test_ckb_conv_fwd_3d_fp32.cpp b/experimental/builder/test/conv/test_ckb_conv_fwd_3d_fp32.cpp
index 393ea9206d..43ffb3f89a 100644
--- a/experimental/builder/test/conv/test_ckb_conv_fwd_3d_fp32.cpp
+++ b/experimental/builder/test/conv/test_ckb_conv_fwd_3d_fp32.cpp
@@ -1,3 +1,6 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
 #include "utils/ckb_conv_test_common.hpp"
 
 using namespace ck_tile::builder::test_utils;
@@ -23,7 +26,7 @@ TEST(FwdConvInstances,
     run_test_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<
         FwdConvSignature,
         FwdThreadBlock,
-        BlockGemmPipelineVersion::V1,
+        PipelineVersion::V1,
         ConvFwdSpecialization::FILTER_1X1_PAD0>();
 }
 
diff --git a/experimental/builder/test/conv/test_conv_traits.cpp b/experimental/builder/test/conv/test_conv_traits.cpp
new file mode 100644
index 0000000000..ca453d2ad4
--- /dev/null
+++ b/experimental/builder/test/conv/test_conv_traits.cpp
@@ -0,0 +1,316 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include <gtest/gtest.h>
+#include <gmock/gmock.h>
+#include <concepts>
+
+#include <ck_tile/builder/reflect/conv_traits.hpp>
+#include <ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp>
+#include <ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp>
+#include <ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp>
+
+namespace {
+
+using ::testing::ElementsAre;
+
+// Empty fixture: it declares no setup, teardown or members and only names the TEST_F suite below
+class ConvTraitsTest : public ::testing::Test
+{
+};
+
+// Checks each ConvTraits member against a DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3 instance
+TEST_F(ConvTraitsTest, ConvFwdTraitsExtraction)
+{
+    // Concrete instance under test; each trailing comment names the parameter that argument fills
+    using DeviceInstance =
+        ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<
+            2,                                               // NDimSpatial
+            ck::tensor_layout::convolution::GNHWC,           // ALayout
+            ck::tensor_layout::convolution::GKYXC,           // BLayout
+            ck::Tuple<>,                                     // DsLayout
+            ck::tensor_layout::convolution::GNHWK,           // ELayout
+            ck::half_t,                                      // ADataType
+            ck::half_t,                                      // BDataType
+            float,                                           // AccDataType
+            ck::half_t,                                      // CShuffleDataType
+            ck::Tuple<>,                                     // DsDataType
+            ck::half_t,                                      // EDataType
+            ck::tensor_operation::element_wise::PassThrough, // AElementwiseOperation
+            ck::tensor_operation::element_wise::PassThrough, // BElementwiseOperation
+            ck::tensor_operation::element_wise::PassThrough, // CDEElementwiseOperation
+            ck::tensor_operation::device::ConvolutionForwardSpecialization::
+                Default,                                               // ConvForwardSpecialization
+            ck::tensor_operation::device::GemmSpecialization::Default, // GemmSpec
+            256,                                                       // BlockSize
+            128,                                                       // MPerBlock
+            128,                                                       // NPerBlock
+            16,                                                        // KPerBlock
+            8,                                                         // AK1
+            8,                                                         // BK1
+            32,                                                        // MPerXDL
+            32,                                                        // NPerXDL
+            4,                                                         // MXdlPerWave
+            4,                                                         // NXdlPerWave
+            ck::Sequence<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+            ck::Sequence<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+            ck::Sequence<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+            2,                      // ABlockTransferSrcVectorDim
+            8,                      // ABlockTransferSrcScalarPerVector
+            8,                      // ABlockTransferDstScalarPerVector_AK1
+            1,                      // ABlockLdsExtraM
+            ck::Sequence<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+            ck::Sequence<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+            ck::Sequence<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+            2,                      // BBlockTransferSrcVectorDim
+            8,                      // BBlockTransferSrcScalarPerVector
+            8,                      // BBlockTransferDstScalarPerVector_BK1
+            1,                      // BBlockLdsExtraN
+            1,                      // CShuffleMXdlPerWavePerShuffle
+            1,                      // CShuffleNXdlPerWavePerShuffle
+            ck::Sequence<1,
+                         32,
+                         1,
+                         8>, // CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+            8,               // CDEBlockTransferScalarPerVector_NPerBlock
+            ck::BlockGemmPipelineScheduler::Intrawave, // BlkGemmPipeSched
+            ck::PipelineVersion::v1,                   // BlkGemmPipelineVer
+            ck::half_t,                                // AComputeDataType
+            ck::half_t,                                // BComputeDataType
+            false>;                                    // DirectLoad
+
+    // ConvTraits exposes the instance's compile-time configuration as static members
+    using Traits = ck_tile::reflect::conv::ConvTraits<DeviceInstance>;
+
+    // Verify signature information
+    EXPECT_EQ(Traits::spatial_dim, 2);
+    EXPECT_EQ(Traits::direction, ck_tile::builder::ConvDirection::FORWARD);
+    EXPECT_EQ(Traits::layout, ck_tile::builder::GroupConvLayout2D::GNHWC_GKYXC_GNHWK);
+    EXPECT_EQ(Traits::data_type, ck_tile::builder::DataType::FP16);
+    EXPECT_EQ(Traits::input_element_op, ck_tile::builder::ElementwiseOperation::PASS_THROUGH);
+    EXPECT_EQ(Traits::weight_element_op, ck_tile::builder::ElementwiseOperation::PASS_THROUGH);
+    EXPECT_EQ(Traits::output_element_op, ck_tile::builder::ElementwiseOperation::PASS_THROUGH);
+
+    // Verify specializations
+    EXPECT_EQ(Traits::gemm_padding, ck_tile::builder::GemmPadding::DEFAULT);
+    EXPECT_EQ(Traits::conv_specialization, ck_tile::builder::ConvFwdSpecialization::DEFAULT);
+
+    // Verify algorithm information
+    EXPECT_EQ(Traits::thread_block_size, 256);
+
+    // Verify tile dimensions
+    EXPECT_EQ(Traits::tile_dims.m, 128);
+    EXPECT_EQ(Traits::tile_dims.n, 128);
+    EXPECT_EQ(Traits::tile_dims.k, 16);
+
+    // Verify A tile transfer info (k0 == 2 is presumably KPerBlock / AK1 = 16 / 8 -- confirm)
+    EXPECT_EQ(Traits::a_tile_transfer.tile_dimensions.k0, 2);
+    EXPECT_EQ(Traits::a_tile_transfer.tile_dimensions.m_or_n, 128);
+    EXPECT_EQ(Traits::a_tile_transfer.tile_dimensions.k1, 8);
+    EXPECT_EQ(Traits::a_tile_transfer.transfer_params.k1, 8);
+    EXPECT_THAT(Traits::a_tile_transfer.transfer_params.thread_cluster_dims, ElementsAre(4, 64, 1));
+    EXPECT_THAT(Traits::a_tile_transfer.transfer_params.thread_cluster_order, ElementsAre(1, 0, 2));
+    EXPECT_THAT(Traits::a_tile_transfer.transfer_params.src_access_order, ElementsAre(1, 0, 2));
+    EXPECT_EQ(Traits::a_tile_transfer.transfer_params.src_vector_dim, 2);
+    EXPECT_EQ(Traits::a_tile_transfer.transfer_params.src_scalar_per_vector, 8);
+    EXPECT_EQ(Traits::a_tile_transfer.transfer_params.dst_scalar_per_vector_k1, 8);
+    EXPECT_TRUE(Traits::a_tile_transfer.transfer_params.lds_padding);
+
+    // Verify B tile transfer info (k0 == 2 is presumably KPerBlock / BK1 = 16 / 8 -- confirm)
+    EXPECT_EQ(Traits::b_tile_transfer.tile_dimensions.k0, 2);
+    EXPECT_EQ(Traits::b_tile_transfer.tile_dimensions.m_or_n, 128);
+    EXPECT_EQ(Traits::b_tile_transfer.tile_dimensions.k1, 8);
+    EXPECT_EQ(Traits::b_tile_transfer.transfer_params.k1, 8);
+    EXPECT_THAT(Traits::b_tile_transfer.transfer_params.thread_cluster_dims, ElementsAre(4, 64, 1));
+    EXPECT_THAT(Traits::b_tile_transfer.transfer_params.thread_cluster_order, ElementsAre(1, 0, 2));
+    EXPECT_THAT(Traits::b_tile_transfer.transfer_params.src_access_order, ElementsAre(1, 0, 2));
+    EXPECT_EQ(Traits::b_tile_transfer.transfer_params.src_vector_dim, 2);
+    EXPECT_EQ(Traits::b_tile_transfer.transfer_params.src_scalar_per_vector, 8);
+    EXPECT_EQ(Traits::b_tile_transfer.transfer_params.dst_scalar_per_vector_k1, 8);
+    EXPECT_TRUE(Traits::b_tile_transfer.transfer_params.lds_padding);
+
+    // Verify warp GEMM params (presumably MPerXDL, NPerXDL, MXdlPerWave, NXdlPerWave -- confirm)
+    EXPECT_EQ(Traits::warp_gemm.gemm_m, 32);
+    EXPECT_EQ(Traits::warp_gemm.gemm_n, 32);
+    EXPECT_EQ(Traits::warp_gemm.m_iter, 4);
+    EXPECT_EQ(Traits::warp_gemm.n_iter, 4);
+
+    // Verify output tile transfer info (presumably the CShuffle / CDEBlockTransfer arguments)
+    EXPECT_EQ(Traits::c_tile_transfer.shuffle_params.m_gemms_per_shuffle, 1);
+    EXPECT_EQ(Traits::c_tile_transfer.shuffle_params.n_gemms_per_shuffle, 1);
+    EXPECT_THAT(Traits::c_tile_transfer.thread_cluster_dims, ElementsAre(1, 32, 1, 8));
+    EXPECT_EQ(Traits::c_tile_transfer.scalar_per_vector, 8);
+
+    // Verify pipeline configuration
+    EXPECT_EQ(Traits::pipeline_scheduler, ck_tile::builder::PipelineScheduler::INTRAWAVE);
+    EXPECT_EQ(Traits::pipeline_version, ck_tile::builder::PipelineVersion::V1);
+}
+
+// Checks ConvTraits signature, specialization, block-size and tile members for the non-V3 op
+TEST_F(ConvTraitsTest, ConvFwdBaseTraitsExtraction)
+{
+    // Concrete instance under test; each trailing comment names the parameter that argument fills
+    using DeviceInstance =
+        ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
+            2,                                               // NDimSpatial
+            ck::tensor_layout::convolution::GNHWC,           // ALayout
+            ck::tensor_layout::convolution::GKYXC,           // BLayout
+            ck::Tuple<>,                                     // DsLayout
+            ck::tensor_layout::convolution::GNHWK,           // ELayout
+            ck::half_t,                                      // ADataType
+            ck::half_t,                                      // BDataType
+            float,                                           // AccDataType
+            ck::half_t,                                      // CShuffleDataType
+            ck::Tuple<>,                                     // DsDataType
+            ck::half_t,                                      // EDataType
+            ck::tensor_operation::element_wise::PassThrough, // AElementwiseOperation
+            ck::tensor_operation::element_wise::PassThrough, // BElementwiseOperation
+            ck::tensor_operation::element_wise::PassThrough, // CDEElementwiseOperation
+            ck::tensor_operation::device::ConvolutionForwardSpecialization::
+                Default,                                               // ConvForwardSpecialization
+            ck::tensor_operation::device::GemmSpecialization::Default, // GemmSpec
+            1,                                                         // NumGemmKPrefetchStage
+            256,                                                       // BlockSize
+            128,                                                       // MPerBlock
+            128,                                                       // NPerBlock
+            16,                                                        // KPerBlock
+            8,                                                         // AK1
+            8,                                                         // BK1
+            32,                                                        // MPerXDL
+            32,                                                        // NPerXDL
+            4,                                                         // MXdlPerWave
+            4,                                                         // NXdlPerWave
+            ck::Sequence<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+            ck::Sequence<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+            ck::Sequence<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+            2,                      // ABlockTransferSrcVectorDim
+            8,                      // ABlockTransferSrcScalarPerVector
+            8,                      // ABlockTransferDstScalarPerVector_AK1
+            1,                      // ABlockLdsExtraM
+            ck::Sequence<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+            ck::Sequence<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+            ck::Sequence<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+            2,                      // BBlockTransferSrcVectorDim
+            8,                      // BBlockTransferSrcScalarPerVector
+            8,                      // BBlockTransferDstScalarPerVector_BK1
+            1,                      // BBlockLdsExtraN
+            1,                      // CShuffleMXdlPerWavePerShuffle
+            1,                      // CShuffleNXdlPerWavePerShuffle
+            ck::Sequence<1,
+                         32,
+                         1,
+                         8>, // CDEBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+            8,               // CDEBlockTransferScalarPerVector_NPerBlock
+            ck::half_t,      // AComputeDataType
+            ck::half_t,      // BComputeDataType
+            ck::LoopScheduler::Default, // LoopSched
+            1>;                         // NumGroupsToMerge
+
+    // ConvTraits exposes the instance's compile-time configuration as static members
+    using Traits = ck_tile::reflect::conv::ConvTraits<DeviceInstance>;
+
+    // Verify signature information
+    EXPECT_EQ(Traits::spatial_dim, 2);
+    EXPECT_EQ(Traits::direction, ck_tile::builder::ConvDirection::FORWARD);
+    EXPECT_EQ(Traits::layout, ck_tile::builder::GroupConvLayout2D::GNHWC_GKYXC_GNHWK);
+    EXPECT_EQ(Traits::data_type, ck_tile::builder::DataType::FP16);
+    EXPECT_EQ(Traits::input_element_op, ck_tile::builder::ElementwiseOperation::PASS_THROUGH);
+    EXPECT_EQ(Traits::weight_element_op, ck_tile::builder::ElementwiseOperation::PASS_THROUGH);
+    EXPECT_EQ(Traits::output_element_op, ck_tile::builder::ElementwiseOperation::PASS_THROUGH);
+
+    // Verify specializations
+    EXPECT_EQ(Traits::gemm_padding, ck_tile::builder::GemmPadding::DEFAULT);
+    EXPECT_EQ(Traits::conv_specialization, ck_tile::builder::ConvFwdSpecialization::DEFAULT);
+
+    // Verify algorithm information
+    EXPECT_EQ(Traits::thread_block_size, 256);
+
+    // Verify tile dimensions (the MPerBlock, NPerBlock and KPerBlock arguments above)
+    EXPECT_EQ(Traits::tile_dims.m, 128);
+    EXPECT_EQ(Traits::tile_dims.n, 128);
+    EXPECT_EQ(Traits::tile_dims.k, 16);
+}
+// Checks ConvTraits signature, specialization, block-size and tile members for the Large_Tensor op
+TEST_F(ConvTraitsTest, ConvFwdLargeTensorTraitsExtraction)
+{
+    // Concrete instance under test; each trailing comment names the parameter that argument fills
+    using DeviceInstance =
+        ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor<
+            2,                                               // NDimSpatial
+            ck::tensor_layout::convolution::GNHWC,           // ALayout
+            ck::tensor_layout::convolution::GKYXC,           // BLayout
+            ck::Tuple<>,                                     // DsLayout
+            ck::tensor_layout::convolution::GNHWK,           // ELayout
+            ck::half_t,                                      // ADataType
+            ck::half_t,                                      // BDataType
+            float,                                           // AccDataType
+            ck::half_t,                                      // CShuffleDataType
+            ck::Tuple<>,                                     // DsDataType
+            ck::half_t,                                      // EDataType
+            ck::tensor_operation::element_wise::PassThrough, // AElementwiseOperation
+            ck::tensor_operation::element_wise::PassThrough, // BElementwiseOperation
+            ck::tensor_operation::element_wise::PassThrough, // CDEElementwiseOperation
+            ck::tensor_operation::device::ConvolutionForwardSpecialization::
+                Default,                                               // ConvForwardSpecialization
+            ck::tensor_operation::device::GemmSpecialization::Default, // GemmSpec
+            1,                                                         // NumGemmKPrefetchStage
+            256,                                                       // BlockSize
+            128,                                                       // MPerBlock
+            128,                                                       // NPerBlock
+            16,                                                        // KPerBlock
+            8,                                                         // AK1
+            8,                                                         // BK1
+            32,                                                        // MPerXDL
+            32,                                                        // NPerXDL
+            4,                                                         // MXdlPerWave
+            4,                                                         // NXdlPerWave
+            ck::Sequence<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
+            ck::Sequence<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+            ck::Sequence<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+            2,                      // ABlockTransferSrcVectorDim
+            8,                      // ABlockTransferSrcScalarPerVector
+            8,                      // ABlockTransferDstScalarPerVector_AK1
+            1,                      // ABlockLdsExtraM
+            ck::Sequence<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
+            ck::Sequence<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+            ck::Sequence<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+            2,                      // BBlockTransferSrcVectorDim
+            8,                      // BBlockTransferSrcScalarPerVector
+            8,                      // BBlockTransferDstScalarPerVector_BK1
+            1,                      // BBlockLdsExtraN
+            1,                      // CShuffleMXdlPerWavePerShuffle
+            1,                      // CShuffleNXdlPerWavePerShuffle
+            ck::Sequence<1,
+                         32,
+                         1,
+                         8>,             // CDEBlockTransferClusterLengths
+            8,                           // CDEBlockTransferScalarPerVector_NPerBlock
+            ck::half_t,                  // AComputeDataType
+            ck::half_t,                  // BComputeDataType
+            ck::LoopScheduler::Default>; // LoopSched
+
+    // ConvTraits exposes the instance's compile-time configuration as static members
+    using Traits = ck_tile::reflect::conv::ConvTraits<DeviceInstance>;
+
+    // Verify signature information
+    EXPECT_EQ(Traits::spatial_dim, 2);
+    EXPECT_EQ(Traits::direction, ck_tile::builder::ConvDirection::FORWARD);
+    EXPECT_EQ(Traits::layout, ck_tile::builder::GroupConvLayout2D::GNHWC_GKYXC_GNHWK);
+    EXPECT_EQ(Traits::data_type, ck_tile::builder::DataType::FP16);
+    EXPECT_EQ(Traits::input_element_op, ck_tile::builder::ElementwiseOperation::PASS_THROUGH);
+    EXPECT_EQ(Traits::weight_element_op, ck_tile::builder::ElementwiseOperation::PASS_THROUGH);
+    EXPECT_EQ(Traits::output_element_op, ck_tile::builder::ElementwiseOperation::PASS_THROUGH);
+
+    // Verify specializations
+    EXPECT_EQ(Traits::gemm_padding, ck_tile::builder::GemmPadding::DEFAULT);
+    EXPECT_EQ(Traits::conv_specialization, ck_tile::builder::ConvFwdSpecialization::DEFAULT);
+
+    // Verify algorithm information
+    EXPECT_EQ(Traits::thread_block_size, 256);
+
+    // Verify tile dimensions (the MPerBlock, NPerBlock and KPerBlock arguments above)
+    EXPECT_EQ(Traits::tile_dims.m, 128);
+    EXPECT_EQ(Traits::tile_dims.n, 128);
+    EXPECT_EQ(Traits::tile_dims.k, 16);
+}
+} // anonymous namespace
diff --git a/experimental/builder/test/impl/conv_algorithm_types.hpp b/experimental/builder/test/impl/conv_algorithm_types.hpp
index 9c5ca9b97b..88c5b5787a 100644
--- a/experimental/builder/test/impl/conv_algorithm_types.hpp
+++ b/experimental/builder/test/impl/conv_algorithm_types.hpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -49,14 +49,14 @@ struct GridwiseWmmaGemm
     size_t n_per_wmma      = 0;
     size_t m_wmma_per_wave = 0;
     size_t n_wmma_per_wave = 0;
-    GridwiseGemmPipelineVersion pipeline_version;
+    PipelineVersion pipeline_version;
 };
 static_assert(ckb::GridwiseWmmaGemmDescriptor<GridwiseWmmaGemm>);
 
 struct BlockGemm
 {
-    BlockGemmPipelineVersion pipeline_version;
-    BlockGemmPipelineScheduler scheduler;
+    PipelineVersion pipeline_version;
+    PipelineScheduler scheduler;
 };
 static_assert(ckb::BlockGemmDescriptor<BlockGemm>);
 
@@ -156,7 +156,7 @@ struct ConvAlgorithm_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
     GemmSpecialization gemm_specialization;
     size_t num_gemm_k_prefetch_stages;
     size_t num_groups_to_merge;
-    LoopScheduler loop_scheduler;
+    PipelineScheduler loop_scheduler;
 };
 static_assert(
     ckb::ConvAlgorithmDescriptor<ConvAlgorithm_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle>);
@@ -191,7 +191,7 @@ struct ConvAlgorithm_DeviceGroupedConvFwdMultipleD_Wmma_CShuffle
     ConvFwdSpecialization fwd_specialization;
     GemmSpecialization gemm_specialization;
     size_t num_gemm_k_prefetch_stages;
-    LoopScheduler loop_scheduler;
+    PipelineScheduler loop_scheduler;
 };
 static_assert(
     ckb::ConvAlgorithmDescriptor<ConvAlgorithm_DeviceGroupedConvFwdMultipleD_Wmma_CShuffle>);
@@ -214,4 +214,84 @@ static_assert(
 static_assert(
     ckb::SpecifiesLoopScheduler<ConvAlgorithm_DeviceGroupedConvFwdMultipleD_Wmma_CShuffle>);
 
+// DL-specific descriptors
+struct DlThreadConfig // plain aggregate of five size_t values; no default member initializers
+{
+    size_t k0_per_block;
+    size_t k1;
+    size_t m1_per_thread;
+    size_t n1_per_thread;
+    size_t k_per_thread;
+};
+static_assert(ckb::DlThreadConfigDescriptor<DlThreadConfig>); // compile-time check that this type satisfies the ckb descriptor
+
+struct DlThreadCluster // two fixed-size pairs, one for M1 and one for N1 (per the field names — confirm against the DL device op)
+{
+    std::array<size_t, 2> m1_xs; // e.g., {8, 2}
+    std::array<size_t, 2> n1_xs; // e.g., {8, 2}
+};
+static_assert(ckb::DlThreadClusterDescriptor<DlThreadCluster>); // compile-time check that this type satisfies the ckb descriptor
+
+struct DlBlockTransferK0M0M1K1 // seven 4-element arrays; same field list as DlBlockTransferK0N0N1K1
+{
+    std::array<size_t, 4> thread_slice_lengths;
+    std::array<size_t, 4> thread_cluster_lengths;
+    std::array<size_t, 4> thread_cluster_arrange_order;
+    std::array<size_t, 4> src_access_order;
+    std::array<size_t, 4> src_vector_tensor_lengths;
+    std::array<size_t, 4> src_vector_tensor_contiguous_dim_order;
+    std::array<size_t, 4> dst_vector_tensor_lengths;
+};
+static_assert(ckb::DlBlockTransferK0M0M1K1Descriptor<DlBlockTransferK0M0M1K1>); // compile-time conformance check
+
+struct DlBlockTransferK0N0N1K1 // seven 4-element arrays; same field list as DlBlockTransferK0M0M1K1
+{
+    std::array<size_t, 4> thread_slice_lengths;
+    std::array<size_t, 4> thread_cluster_lengths;
+    std::array<size_t, 4> thread_cluster_arrange_order;
+    std::array<size_t, 4> src_access_order;
+    std::array<size_t, 4> src_vector_tensor_lengths;
+    std::array<size_t, 4> src_vector_tensor_contiguous_dim_order;
+    std::array<size_t, 4> dst_vector_tensor_lengths;
+};
+static_assert(ckb::DlBlockTransferK0N0N1K1Descriptor<DlBlockTransferK0N0N1K1>); // compile-time conformance check
+
+struct DlCThreadTransfer // one 6-element order array plus two scalar settings
+{
+    std::array<size_t, 6> src_dst_access_order; // NOTE(review): presumably a permutation of six axis indices — confirm
+    size_t src_dst_vector_dim;
+    size_t dst_scalar_per_vector;
+};
+static_assert(ckb::DlCThreadTransferDescriptor<DlCThreadTransfer>); // compile-time conformance check
+
+struct ConvAlgorithm_DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK // bundles thread block, specializations and the five Dl* structs
+{
+    ThreadBlock thread_block;
+    ConvFwdSpecialization fwd_specialization;
+    GemmSpecialization gemm_specialization;
+    DlThreadConfig dl_thread_config;
+    DlThreadCluster dl_thread_cluster;
+    DlBlockTransferK0M0M1K1 dl_block_transfer_a; // A side uses the K0M0M1K1 struct
+    DlBlockTransferK0N0N1K1 dl_block_transfer_b; // B side uses the K0N0N1K1 struct
+    DlCThreadTransfer dl_c_thread_transfer;
+};
+static_assert( // one compile-time check per ckb trait this algorithm struct is expected to satisfy
+    ckb::ConvAlgorithmDescriptor<ConvAlgorithm_DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK>);
+static_assert(
+    ckb::SpecifiesThreadBlock<ConvAlgorithm_DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK>);
+static_assert(ckb::SpecifiesFwdConcSpecialization< // NOTE(review): "Conc" looks like a typo for "Conv" in the ckb name — confirm
+              ConvAlgorithm_DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK>);
+static_assert(
+    ckb::SpecifiesGemmSpecialization<ConvAlgorithm_DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK>);
+static_assert(
+    ckb::SpecifiesDlThreadConfig<ConvAlgorithm_DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK>);
+static_assert(
+    ckb::SpecifiesDlThreadCluster<ConvAlgorithm_DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK>);
+static_assert(
+    ckb::SpecifiesDlBlockTransferA<ConvAlgorithm_DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK>);
+static_assert(
+    ckb::SpecifiesDlBlockTransferB<ConvAlgorithm_DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK>);
+static_assert(
+    ckb::SpecifiesDlCThreadTransfer<ConvAlgorithm_DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK>);
+
 } // namespace ck_tile::builder::test
diff --git a/experimental/builder/test/impl/conv_signature_types.hpp b/experimental/builder/test/impl/conv_signature_types.hpp
index cc5490c711..5e6684c4cd 100644
--- a/experimental/builder/test/impl/conv_signature_types.hpp
+++ b/experimental/builder/test/impl/conv_signature_types.hpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/experimental/builder/test/test_bwd_weight_instance_traits.cpp b/experimental/builder/test/test_bwd_weight_instance_traits.cpp
new file mode 100644
index 0000000000..24c28c2b9d
--- /dev/null
+++ b/experimental/builder/test/test_bwd_weight_instance_traits.cpp
@@ -0,0 +1,112 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include <gtest/gtest.h>
+#include <ck/ck.hpp>
+#include <ck_tile/builder/reflect/instance_traits.hpp>
+#include <ck_tile/builder/reflect/instance_traits_device_grouped_conv_bwd_weight_xdl_cshuffle.hpp>
+
+namespace {
+
+TEST(InstanceTraits, BwdWeightXdlCShuffleInstanceStringReturnsCorrectFormat)
+{ // Pins the exact text instance_string<> produces for one concrete bwd-weight XDL CShuffle instantiation.
+    using DeviceInstance = ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Xdl_CShuffle<
+        2,                                               // NDimSpatial
+        ck::tensor_layout::convolution::GNHWC,           // InLayout
+        ck::tensor_layout::convolution::GKYXC,           // WeiLayout
+        ck::tensor_layout::convolution::GNHWK,           // OutLayout
+        ck::half_t,                                      // InDataType
+        ck::half_t,                                      // WeiDataType
+        ck::half_t,                                      // OutDataType
+        float,                                           // AccDataType
+        ck::tensor_operation::element_wise::PassThrough, // InElementwiseOperation
+        ck::tensor_operation::element_wise::PassThrough, // WeiElementwiseOperation
+        ck::tensor_operation::element_wise::PassThrough, // OutElementwiseOperation
+        ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::
+            Default,            // ConvBackwardWeightSpecialization
+        256,                    // BlockSize
+        128,                    // MPerBlock
+        128,                    // NPerBlock
+        4,                      // K0PerBlock
+        8,                      // K1
+        32,                     // MPerXDL
+        32,                     // NPerXDL
+        2,                      // MXdlPerWave
+        2,                      // NXdlPerWave
+        ck::Sequence<4, 64, 1>, // ABlockTransferThreadClusterLengths_K0_M_K1
+        ck::Sequence<1, 0, 2>,  // ABlockTransferThreadClusterArrangeOrder
+        ck::Sequence<1, 0, 2>,  // ABlockTransferSrcAccessOrder
+        2,                      // ABlockTransferSrcVectorDim
+        8,                      // ABlockTransferSrcScalarPerVector
+        8,                      // ABlockTransferDstScalarPerVector_K1
+        false,                  // ABlockLdsAddExtraM
+        ck::Sequence<4, 64, 1>, // BBlockTransferThreadClusterLengths_K0_N_K1
+        ck::Sequence<1, 0, 2>,  // BBlockTransferThreadClusterArrangeOrder
+        ck::Sequence<1, 0, 2>,  // BBlockTransferSrcAccessOrder
+        2,                      // BBlockTransferSrcVectorDim
+        8,                      // BBlockTransferSrcScalarPerVector
+        8,                      // BBlockTransferDstScalarPerVector_K1
+        false,                  // BBlockLdsAddExtraN
+        1,                      // CShuffleMXdlPerWavePerShuffle
+        1,                      // CShuffleNXdlPerWavePerShuffle
+        ck::Sequence<1,
+                     32,
+                     1,
+                     8>, // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+        8,               // CBlockTransferScalarPerVector_NWaveNPerXdl
+        ck::half_t,      // ComputeTypeA
+        ck::half_t,      // ComputeTypeB
+        1,               // MaxTransposeTransferSrcScalarPerVector
+        1>;              // MaxTransposeTransferDstScalarPerVector
+
+    std::string instance_str = ck_tile::reflect::instance_string<DeviceInstance>();
+
+    std::string expected_str = "DeviceGroupedConvBwdWeight_Xdl_CShuffle" // op name, then one fragment per argument above
+                               "<2"             // NDimSpatial
+                               ",GNHWC"         // InLayout
+                               ",GKYXC"         // WeiLayout
+                               ",GNHWK"         // OutLayout
+                               ",fp16"          // InDataType
+                               ",fp16"          // WeiDataType
+                               ",fp16"          // OutDataType
+                               ",fp32"          // AccDataType
+                               ",PassThrough"   // InElementwiseOperation
+                               ",PassThrough"   // WeiElementwiseOperation
+                               ",PassThrough"   // OutElementwiseOperation
+                               ",Default"       // ConvBackwardWeightSpecialization
+                               ",256"           // BlockSize
+                               ",128"           // MPerBlock
+                               ",128"           // NPerBlock
+                               ",4"             // K0PerBlock
+                               ",8"             // K1
+                               ",32"            // MPerXDL
+                               ",32"            // NPerXDL
+                               ",2"             // MXdlPerWave
+                               ",2"             // NXdlPerWave
+                               ",Seq(4,64,1)"   // ABlockTransferThreadClusterLengths_K0_M_K1
+                               ",Seq(1,0,2)"    // ABlockTransferThreadClusterArrangeOrder
+                               ",Seq(1,0,2)"    // ABlockTransferSrcAccessOrder
+                               ",2"             // ABlockTransferSrcVectorDim
+                               ",8"             // ABlockTransferSrcScalarPerVector
+                               ",8"             // ABlockTransferDstScalarPerVector_K1
+                               ",false"         // ABlockLdsAddExtraM
+                               ",Seq(4,64,1)"   // BBlockTransferThreadClusterLengths_K0_N_K1
+                               ",Seq(1,0,2)"    // BBlockTransferThreadClusterArrangeOrder
+                               ",Seq(1,0,2)"    // BBlockTransferSrcAccessOrder
+                               ",2"             // BBlockTransferSrcVectorDim
+                               ",8"             // BBlockTransferSrcScalarPerVector
+                               ",8"             // BBlockTransferDstScalarPerVector_K1
+                               ",false"         // BBlockLdsAddExtraN
+                               ",1"             // CShuffleMXdlPerWavePerShuffle
+                               ",1"             // CShuffleNXdlPerWavePerShuffle
+                               ",Seq(1,32,1,8)" // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+                               ",8"             // CBlockTransferScalarPerVector_NWaveNPerXdl
+                               ",fp16"          // ComputeTypeA
+                               ",fp16"          // ComputeTypeB
+                               ",1"             // MaxTransposeTransferSrcScalarPerVector
+                               ",1>";           // MaxTransposeTransferDstScalarPerVector
+
+    EXPECT_EQ(instance_str, expected_str);
+}
+
+} // anonymous namespace
diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward.cpp
index 53199f1d77..73a731a69c 100644
--- a/experimental/builder/test/test_ck_factory_grouped_convolution_forward.cpp
+++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward.cpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp>
 
diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bias_bnorm_clamp.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bias_bnorm_clamp.cpp
index c68fdb4b24..75cb58018e 100644
--- a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bias_bnorm_clamp.cpp
+++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bias_bnorm_clamp.cpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_bnorm_clamp.hpp>
 
diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bias_clamp.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bias_clamp.cpp
index a9e5e39e8f..b301bab966 100644
--- a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bias_clamp.cpp
+++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bias_clamp.cpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bias_clamp.hpp>
 
diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bilinear.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bilinear.cpp
index cc9cbd3dda..8b77b11e0c 100644
--- a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bilinear.cpp
+++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_bilinear.cpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_bilinear.hpp>
 #include "ck/utility/data_type.hpp"
diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_clamp.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_clamp.cpp
index a8d1d1763f..e678fa1258 100644
--- a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_clamp.cpp
+++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_clamp.cpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_clamp.hpp>
 #include "ck/utility/data_type.hpp"
diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_convscale.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_convscale.cpp
index 0da300e3d8..4882c6fde2 100644
--- a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_convscale.cpp
+++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_convscale.cpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale.hpp>
 #include <ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale_relu.hpp>
diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_dynamic_op.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_dynamic_op.cpp
index e918785fd7..7437385e2a 100644
--- a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_dynamic_op.cpp
+++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_dynamic_op.cpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dynamic_op.hpp>
 #include "ck/utility/data_type.hpp"
diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scale.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scale.cpp
index 428d1c81f3..8144d7bedd 100644
--- a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scale.cpp
+++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scale.cpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scale.hpp>
 
diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scaleadd_ab.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scaleadd_ab.cpp
index 774c30c05e..beebc3f853 100644
--- a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scaleadd_ab.cpp
+++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scaleadd_ab.cpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_ab.hpp>
 
diff --git a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scaleadd_scaleadd_relu.cpp b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scaleadd_scaleadd_relu.cpp
index ba8726a643..79feb298cd 100644
--- a/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scaleadd_scaleadd_relu.cpp
+++ b/experimental/builder/test/test_ck_factory_grouped_convolution_forward_scaleadd_scaleadd_relu.cpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_scaleadd_scaleadd_relu.hpp>
 
diff --git a/experimental/builder/test/test_conv_builder.cpp b/experimental/builder/test/test_conv_builder.cpp
index 4ec189daa4..27b30946b2 100644
--- a/experimental/builder/test/test_conv_builder.cpp
+++ b/experimental/builder/test/test_conv_builder.cpp
@@ -1,3 +1,6 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
 #include <gtest/gtest.h>
 
 class ConvBuilderTest : public ::testing::Test
diff --git a/experimental/builder/test/test_conv_description.cpp b/experimental/builder/test/test_conv_description.cpp
new file mode 100644
index 0000000000..97af4af795
--- /dev/null
+++ b/experimental/builder/test/test_conv_description.cpp
@@ -0,0 +1,169 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include <gtest/gtest.h>
+#include <gmock/gmock.h>
+
+#include <ck_tile/builder/conv_builder.hpp>
+#include <ck_tile/builder/reflect/conv_description.hpp>
+#include "testing_utils.hpp"
+#include "impl/conv_signature_types.hpp"
+#include "impl/conv_algorithm_types.hpp"
+
+namespace {
+
+namespace ckb = ck_tile::builder;
+namespace ckr = ck_tile::reflect::conv;
+namespace ckt = ck_tile::test;
+
+// Defines the signature of the convolution operation to be tested.
+// This includes dimensionality, direction, data layout, and data type.
+struct ConvSignature // every member has a default, so a default-constructed object is the 2D forward FP16 case
+{
+    int spatial_dim                                 = 2;
+    ckb::ConvDirection direction                    = ckb::ConvDirection::FORWARD;
+    ckb::GroupConvLayout layout                     = ckb::GroupConvLayout2D::GNHWC_GKYXC_GNHWK;
+    ckb::DataType data_type                         = ckb::DataType::FP16;
+    ckb::ElementwiseOperation elementwise_operation = ckb::ElementwiseOperation::PASS_THROUGH;
+    ckb::GroupConvDeviceOp device_operation = // selects the device op the builder should target
+        ckb::FwdGroupConvDeviceOperation::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3;
+};
+static_assert(ckb::ConvSignatureDescriptor<ConvSignature>); // compile-time check that this type satisfies the ckb descriptor
+
+struct DefaultAlgorithm // algorithm settings shared by both ConvDescriptionTest cases in this file
+{
+    ckb::test::ThreadBlock thread_block{.block_size = 256,
+                                        .tile_size  = {.m = 256, .n = 256, .k = 32}};
+
+    ckb::test::GridwiseXdlGemm gridwise_gemm{.ak1            = 8,
+                                             .bk1            = 8,
+                                             .m_per_xdl      = 16,
+                                             .n_per_xdl      = 16,
+                                             .m_xdl_per_wave = 4,
+                                             .n_xdl_per_wave = 4};
+
+    ckb::test::BlockTransferABC block_transfer{ // A and B sides are configured identically below
+        .block_transfer_a              = {.k0 = 4, .m_n = 256, .k1 = 8},
+        .block_transfer_b              = {.k0 = 4, .m_n = 256, .k1 = 8},
+        .thread_cluster_dims_c         = {.m_block        = 1,
+                                          .m_wave_per_xdl = 32,
+                                          .n_block        = 1,
+                                          .n_wave_per_xdl = 8},
+        .lds_transfer_a                = {.src_vector_dim            = 2,
+                                          .src_scalar_per_vector     = 8,
+                                          .lds_dst_scalar_per_vector = 8,
+                                          .is_direct_load            = true,
+                                          .lds_padding               = false},
+        .lds_transfer_b                = {.src_vector_dim            = 2,
+                                          .src_scalar_per_vector     = 8,
+                                          .lds_dst_scalar_per_vector = 8,
+                                          .is_direct_load            = true,
+                                          .lds_padding               = false},
+        .epilogue_c                    = {.m_per_wave_per_shuffle = 1,
+                                          .n_per_wave_per_shuffle = 1,
+                                          .scalar_per_vector      = 8},
+        .block_transfer_access_order_a = {.order = {0, 1, 2}},
+        .block_transfer_access_order_b = {.order = {0, 1, 2}},
+        .src_access_order_a            = {.order = {0, 1, 2}},
+        .src_access_order_b            = {.order = {0, 1, 2}}};
+
+    ckb::ConvFwdSpecialization fwd_specialization = ckb::ConvFwdSpecialization::DEFAULT;
+    ckb::GemmSpecialization gemm_specialization   = ckb::GemmSpecialization::Default;
+    ckb::test::BlockGemm block_gemm{.pipeline_version = ckb::PipelineVersion::V4,
+                                    .scheduler        = ckb::PipelineScheduler::INTRAWAVE};
+};
+static_assert(ckb::ConvAlgorithmDescriptor<DefaultAlgorithm>); // compile-time check that this type satisfies the ckb descriptor
+
+TEST(ConvDescriptionTest, DefaultInstanceHasBriefDescription)
+{ // Checks the one-line summary produced for the default signature/algorithm pair.
+    static constexpr const ConvSignature SIGNATURE;
+    static constexpr const DefaultAlgorithm ALGORITHM;
+    using Builder = ckb::ConvBuilder<SIGNATURE, ALGORITHM>; // both objects are passed as non-type template arguments
+    EXPECT_THAT(ckr::Describe<Builder>().brief(), ckt::StringEqWithDiff("2D Forward convolution"));
+}
+
+TEST(ConvDescriptionTest, DefaultInstanceHasDetailedDescription)
+{ // Pins the full multi-line tree text produced for the default signature/algorithm pair.
+    static constexpr const ConvSignature SIGNATURE;
+    static constexpr const DefaultAlgorithm ALGORITHM;
+    using Builder = ckb::ConvBuilder<SIGNATURE, ALGORITHM>; // both objects are passed as non-type template arguments
+    EXPECT_THAT(ckr::Describe<Builder>().detailed(),
+                ckt::StringEqWithDiff( //
+                    "2D Forward Convolution Kernel\n"
+                    "├─ Signature\n"
+                    "│  ├─ Tensor Type: FP16\n"
+                    "│  ├─ Memory Layout: GNHWC_GKYXC_GNHWK\n"
+                    "│  ├─ Input elementwise operation: PASS_THROUGH\n"
+                    "│  ├─ Weights elementwise operation: PASS_THROUGH\n"
+                    "│  └─ Output elementwise operation: PASS_THROUGH\n"
+                    "├─ Algorithm\n"
+                    "│  ├─ Thread block size: 256\n"
+                    "│  ├─ Data tile size: 256×256×32\n"
+                    "│  ├─ Gemm padding: DEFAULT\n"
+                    "│  ├─ Convolution specialization: DEFAULT\n"
+                    "│  ├─ Pipeline version: V4\n"
+                    "│  ├─ Pipeline scheduler: INTRAWAVE\n"
+                    "│  ├─ Warp Gemm parameters: \n"
+                    "│  │  ├─ subtile size: 16×16\n"
+                    "│  │  └─ Number of warp gemm iterations: 4×4\n"
+                    "│  ├─ Memory access:\n"
+                    "│  │  ├─ A Tile transfer: \n"
+                    "│  │  │  ├─ Tile dimensions: 4×256×8×\n" // NOTE(review): trailing "×" after the last extent — confirm it is intended
+                    "│  │  │  ├─ The innermost K subdimension size: 8\n"
+                    "│  │  │  ├─ Spatial thread distribution over the data tile: 0×1×2\n"
+                    "│  │  │  ├─ The order of accessing data tile axes: 0×1×2\n"
+                    "│  │  │  ├─ Vectorized memory access axis index (with contiguous memory): 2\n"
+                    "│  │  │  ├─ Vector access (GMEM read) instruction size: 8\n"
+                    "│  │  │  ├─ Vector access (LDS write) instruction size: 8\n"
+                    "│  │  │  └─ LDS data layout padding (to prevent bank conflicts): 8\n" // NOTE(review): DefaultAlgorithm sets lds_padding = false — confirm which field is printed
+                    "│  │  ├─ B Tile transfer: \n"
+                    "│  │  │  ├─ Tile dimensions: 4×256×8×\n" // NOTE(review): same trailing "×" as the A tile line
+                    "│  │  │  ├─ The innermost K subdimension size: 8\n"
+                    "│  │  │  ├─ Spatial thread distribution over the data tile: 0×1×2\n"
+                    "│  │  │  ├─ The order of accessing data tile axes: 0×1×2\n"
+                    "│  │  │  ├─ Vectorized memory access axis index (with contiguous memory): 2\n"
+                    "│  │  │  ├─ Vector access (GMEM read) instruction size: 8\n"
+                    "│  │  │  ├─ Vector access (LDS write) instruction size: 8\n"
+                    "│  │  │  └─ LDS data layout padding (to prevent bank conflicts): 8\n" // NOTE(review): same lds_padding question as the A tile line
+                    "│  │  └─ C Tile transfer: \n"
+                    "│  │     ├─ Data shuffle (number of gemm instructions per iteration): 1×1\n"
+                    "│  │     ├─ Spatial thread distribution used to store data: 1×32×1×8\n"
+                    "│  │     └─ Vector access (GMEM write) instruction size: 8\n"
+                    "│  └─ \n" // NOTE(review): empty tree node is part of the expected text — confirm it is intended
+                    "└─ "));   // NOTE(review): output ends on an empty node with no trailing newline
+}
+
+// NOTE: BackwardDataInstanceHasDetailedDescription test is disabled because ConvFactory
+// does not have a specialization for backward data convolutions. The test fails with:
+//   "implicit instantiation of undefined template 'ck_tile::builder::ConvFactory<...>'"
+//
+// To enable this test, a ConvFactory specialization for backward data operations must be
+// implemented first.
+//
+// TEST(ConvDescriptionTest, BackwardDataInstanceHasDetailedDescription)
+// {
+//     struct BackwardDataSignature
+//     {
+//         int spatial_dim              = 2;
+//         ckb::ConvDirection direction = ckb::ConvDirection::BACKWARD_DATA;
+//         ckb::GroupConvLayout layout  = ckb::GroupConvLayout2D::GNHWC_GKYXC_GNHWK;
+//         ckb::DataType data_type      = ckb::DataType::FP16;
+//         ckb::ElementwiseOperation elementwise_operation = ckb::ElementwiseOperation::PASS_THROUGH;
+//         ckb::GroupConvDeviceOp device_operation =
+//             ckb::BwdDataGroupConvDeviceOperation::DeviceGroupedConvBwdDataMultipleD_Xdl_CShuffle_v1;
+//     };
+//     static_assert(ckb::ConvSignatureDescriptor<BackwardDataSignature>);
+//
+//     static constexpr const BackwardDataSignature SIGNATURE;
+//     static constexpr const DefaultAlgorithm ALGORITHM;
+//     using Builder = ckb::ConvBuilder<SIGNATURE, ALGORITHM>;
+//
+//     // Verify Brief works
+//     EXPECT_THAT(ckr::Describe<Builder>().brief(),
+//                 ckt::StringEqWithDiff("2D Backward Data convolution"));
+//
+//     // Verify detailed works - to be updated once ConvFactory is implemented
+//     EXPECT_THAT(ckr::Describe<Builder>().detailed(),
+//                 ckt::StringEqWithDiff("PLACEHOLDER"));
+// }
+} // namespace
diff --git a/experimental/builder/test/test_fwd_instance_traits.cpp b/experimental/builder/test/test_fwd_instance_traits.cpp
index 2e3b6ac264..af950b441c 100644
--- a/experimental/builder/test/test_fwd_instance_traits.cpp
+++ b/experimental/builder/test/test_fwd_instance_traits.cpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <gtest/gtest.h>
 #include <gmock/gmock.h>
@@ -11,6 +11,7 @@
 #include <ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp>
 #include <ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_d_wmma_cshuffle.hpp>
 #include <ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_dl_multiple_d_nhwc_kyxc_nhwk.hpp>
+#include <ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_forward.hpp>
 
 namespace {
 
@@ -720,4 +721,126 @@ TEST(InstanceTraits, DlInstanceStringReturnsCorrectFormat)
     EXPECT_EQ(instance_str, expected_str);
 }
 
+TEST(InstanceTraits, TileInstanceStringReturnsCorrectFormat)
+{
+    using GroupedConvTraitsType =
+        ck_tile::GroupedConvTraits<2 /*NDimSpatial*/,
+                                   ck_tile::ConvolutionSpecialization::Default /*ConvSpec*/,
+                                   ck_tile::tensor_layout::convolution::NHWGC /*InLayout*/,
+                                   ck_tile::tensor_layout::convolution::GKYXC /*WeiLayout*/,
+                                   ck_tile::tuple<> /*DsLayout*/,
+                                   ck_tile::tensor_layout::convolution::NHWGK /*OutLayout*/,
+                                   4 /*VectorSizeA*/,
+                                   4 /*VectorSizeB*/,
+                                   4 /*VectorSizeC*/,
+                                   1 /*NumGroupsToMerge*/,
+                                   false /*EnableSplitImage*/>;
+    // GEMM tile shape: 128x128x32 block tile, 4x1x1 warps (M,N,K), 16x16x16 warp tile.
+    using GemmShape = ck_tile::TileGemmShape<
+        ck_tile::sequence<128 /*M_Tile*/, 128 /*N_Tile*/, 32 /*K_Tile*/>,
+        ck_tile::sequence<4 /*M_Warp*/, 1 /*N_Warp*/, 1 /*K_Warp*/>,
+        ck_tile::sequence<16 /*M_Warp_Tile*/, 16 /*N_Warp_Tile*/, 16 /*K_Warp_Tile*/>>;
+
+    using TilePartitioner = ck_tile::GemmSpatiallyLocalTilePartitioner<
+        GemmShape,
+        GroupedConvTraitsType::FixedGemmParams::TilePartitionerGroupNum,
+        GroupedConvTraitsType::FixedGemmParams::TilePartitionerM01>;
+
+    using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<
+        GroupedConvTraitsType::FixedGemmParams::kPadM,
+        GroupedConvTraitsType::FixedGemmParams::kPadN,
+        GroupedConvTraitsType::FixedGemmParams::kPadK,
+        false /*DoubleSmemBuffer*/,
+        typename GroupedConvTraitsType::AsLayoutFwd,
+        typename GroupedConvTraitsType::BsLayoutFwd,
+        typename GroupedConvTraitsType::CLayoutFwd,
+        GroupedConvTraitsType::FixedGemmParams::TransposeC,
+        GroupedConvTraitsType::FixedGemmParams::UseStructuredSparsity,
+        GroupedConvTraitsType::FixedGemmParams::Persistent,
+        1 /*NumWaveGroups*/>;
+
+    using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem<
+        ck_tile::bf16_t /*InDataType*/,
+        ck_tile::bf16_t /*WeiDataType*/,
+        float /*AccDataType*/,
+        GemmShape,
+        GemmUniversalTraits,
+        ck_tile::GemmPipelineScheduler::Intrawave /*scheduler*/,
+        true /*has_hot_loop_v*/,
+        ck_tile::TailNumber::Full /*tail_number_v*/,
+        ck_tile::element_wise::PassThrough /*AElementwiseOperation*/,
+        ck_tile::element_wise::PassThrough /*BElementwiseOperation*/,
+        ck_tile::bf16_t /*OutDataType*/,
+        GroupedConvTraitsType::FixedGemmParams::FixedVectorSize,
+        GroupedConvTraitsType::VectorSizeA,
+        GroupedConvTraitsType::VectorSizeB>;
+
+    using GemmPipeline = typename ck_tile::GemmPipelineAgBgCrCompV3<UniversalGemmProblem>;
+    // The epilogue repeats the block/warp tile sizes from GemmShape; keep the two in sync.
+    using ConvEpilogue = ck_tile::CShuffleEpilogue<
+        ck_tile::CShuffleEpilogueProblem<ck_tile::bf16_t /*InDataType*/,
+                                         ck_tile::bf16_t /*WeiDataType*/,
+                                         ck_tile::tuple<> /*DsDataType*/,
+                                         float /*AccDataType*/,
+                                         ck_tile::bf16_t /*OutDataType*/,
+                                         typename GroupedConvTraitsType::ImplicitGemmDsLayout,
+                                         typename GroupedConvTraitsType::FixedGemmParams::ELayout,
+                                         ck_tile::element_wise::PassThrough /*CDElementWise*/,
+                                         128 /*MPerBlock*/,
+                                         128 /*NPerBlock*/,
+                                         4 /*M_Warp*/,
+                                         1 /*N_Warp*/,
+                                         16 /*M_Warp_Tile*/,
+                                         16 /*N_Warp_Tile*/,
+                                         16 /*K_Warp_Tile*/,
+                                         GroupedConvTraitsType::FixedGemmParams::TransposeC,
+                                         ck_tile::memory_operation_enum::set /*memory_operation*/,
+                                         1 /*kNumWaveGroups*/,
+                                         GroupedConvTraitsType::FixedGemmParams::FixedVectorSize,
+                                         GroupedConvTraitsType::VectorSizeC>>;
+    // Kernel under test, assembled from the traits, partitioner, pipeline and epilogue above.
+    using GroupedConvFwdKernel =
+        ck_tile::device::GroupedConvolutionForwardKernel<GroupedConvTraitsType,
+                                                         TilePartitioner,
+                                                         GemmPipeline,
+                                                         ConvEpilogue>;
+
+    std::string instance_str = ck_tile::reflect::instance_string<GroupedConvFwdKernel>();
+    // Expected order: conv traits, tile shape, A/B types + pipeline, then Acc/E/Ds types + op.
+    std::string expected_str = "GroupedConvolutionForwardKernel"
+                               "<2"           // NDimSpatial
+                               ",Default"     // ConvSpecialization
+                               ",NHWGC"       // InLayout
+                               ",GKYXC"       // WeiLayout
+                               ",EmptyTuple"  // DsLayout
+                               ",NHWGK"       // OutLayout
+                               ",4"           // VectorSizeA
+                               ",4"           // VectorSizeB
+                               ",4"           // VectorSizeC
+                               ",1"           // NumGroupsToMerge
+                               ",0"           // EnableSplitImage
+                               ",128"         // MPerBlock
+                               ",128"         // NPerBlock
+                               ",32"          // KPerBlock
+                               ",4"           // MWarp
+                               ",1"           // NWarp
+                               ",1"           // KWarp
+                               ",16"          // MWarpTile
+                               ",16"          // NWarpTile
+                               ",16"          // KWarpTile
+                               ",bf16"        // ADataType
+                               ",bf16"        // BDataType
+                               ",COMPUTE_V3"  // BlkGemmPipelineVer
+                               ",Intrawave"   // BlkGemmPipeSched
+                               ",0"           // DoubleSmemBuffer
+                               ",1"           // NumWaveGroups
+                               ",fp32"        // AccDataType
+                               ",bf16"        // EDataType
+                               ",EmptyTuple"  // DsDataType
+                               ",PassThrough" // CDEElementwiseOperation
+                               ">";
+
+    EXPECT_EQ(instance_str, expected_str);
+}
+
 } // anonymous namespace
diff --git a/experimental/builder/test/test_get_instance_string_bwd_weight_grp_conv_xdl.cpp b/experimental/builder/test/test_get_instance_string_bwd_weight_grp_conv_xdl.cpp
new file mode 100644
index 0000000000..68b43c6a99
--- /dev/null
+++ b/experimental/builder/test/test_get_instance_string_bwd_weight_grp_conv_xdl.cpp
@@ -0,0 +1,86 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include <gtest/gtest.h>
+#include <ck_tile/builder/reflect/instance_traits.hpp>
+#include <ck/tensor_operation/gpu/device/device_base.hpp>
+#include <ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp>
+
+// Test GetInstanceString through base class pointer for backward weight XDL variant
+TEST(GetInstanceString, ReturnsStringForBwdWeightGrpConvXdlInstance)
+{
+    // Instantiate the f16 XDL CShuffle instance list for a 2D GNHWC/GKYXC/GNHWK default conv
+    using InstanceTuple = ck::tensor_operation::device::instance::
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances<
+            2,                                             // NDimSpatial
+            ck::tensor_operation::device::instance::GNHWC, // InLayout
+            ck::tensor_operation::device::instance::GKYXC, // WeiLayout
+            ck::tensor_operation::device::instance::GNHWK, // OutLayout
+            ck::tensor_operation::device::instance::
+                ConvBwdWeightDefault>; // ConvBwdWeightSpecialization
+
+    // Get the first instance from the tuple
+    using DeviceInstance = typename std::tuple_element<0, InstanceTuple>::type;
+
+    // Define the base class type using the most general operator base
+    using BaseClass = ck::tensor_operation::device::BaseOperator;
+
+    // Create an instance of the derived class
+    DeviceInstance device_instance;
+
+    // Get a pointer to the base class
+    BaseClass* base_ptr = &device_instance;
+
+    // Call GetInstanceString through the base class pointer
+    std::string instance_str = base_ptr->GetInstanceString();
+
+    // Expected complete instance string for element 0 of
+    // device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances (BlockSize=64, MPerBlock=64,
+    // NPerBlock=64, ...). NOTE: must be updated if that instance list is reordered.
+    std::string expected_str = "DeviceGroupedConvBwdWeight_Xdl_CShuffle"
+                               "<2"             // NDimSpatial
+                               ",GNHWC"         // InLayout
+                               ",GKYXC"         // WeiLayout
+                               ",GNHWK"         // OutLayout
+                               ",fp16"          // InDataType
+                               ",fp16"          // WeiDataType
+                               ",fp16"          // OutDataType
+                               ",fp32"          // AccDataType
+                               ",PassThrough"   // InElementwiseOperation
+                               ",PassThrough"   // WeiElementwiseOperation
+                               ",PassThrough"   // OutElementwiseOperation
+                               ",Default"       // ConvBackwardWeightSpecialization
+                               ",64"            // BlockSize
+                               ",64"            // MPerBlock
+                               ",64"            // NPerBlock
+                               ",4"             // K0PerBlock
+                               ",8"             // K1
+                               ",32"            // MPerXDL
+                               ",32"            // NPerXDL
+                               ",2"             // MXdlPerWave
+                               ",2"             // NXdlPerWave
+                               ",Seq(1,4,8,2)"  // ABlockTransferThreadClusterLengths_K0_M_K1
+                               ",Seq(0,3,1,2)"  // ABlockTransferThreadClusterArrangeOrder
+                               ",Seq(0,2,1,3)"  // ABlockTransferSrcAccessOrder
+                               ",2"             // ABlockTransferSrcVectorDim
+                               ",2"             // ABlockTransferSrcScalarPerVector
+                               ",4"             // ABlockTransferDstScalarPerVector_K1
+                               ",true"          // ABlockLdsAddExtraM
+                               ",Seq(1,4,8,2)"  // BBlockTransferThreadClusterLengths_K0_N_K1
+                               ",Seq(0,3,1,2)"  // BBlockTransferThreadClusterArrangeOrder
+                               ",Seq(0,2,1,3)"  // BBlockTransferSrcAccessOrder
+                               ",2"             // BBlockTransferSrcVectorDim
+                               ",2"             // BBlockTransferSrcScalarPerVector
+                               ",4"             // BBlockTransferDstScalarPerVector_K1
+                               ",true"          // BBlockLdsAddExtraN
+                               ",1"             // CShuffleMXdlPerWavePerShuffle
+                               ",1"             // CShuffleNXdlPerWavePerShuffle
+                               ",Seq(1,16,1,4)" // CBlockTransferClusterLengths
+                               ",2"             // CBlockTransferScalarPerVector_NWaveNPerXdl
+                               ",fp16"          // ComputeTypeA
+                               ",fp16"          // ComputeTypeB
+                               ",1"             // MaxTransposeTransferSrcScalarPerVector
+                               ",1>";           // MaxTransposeTransferDstScalarPerVector
+
+    EXPECT_EQ(instance_str, expected_str);
+}
diff --git a/experimental/builder/test/test_get_instance_string_fwd_grp_conv.cpp b/experimental/builder/test/test_get_instance_string_fwd_grp_conv.cpp
index 7c43c428ce..ca683abedc 100644
--- a/experimental/builder/test/test_get_instance_string_fwd_grp_conv.cpp
+++ b/experimental/builder/test/test_get_instance_string_fwd_grp_conv.cpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <gtest/gtest.h>
 #include <ck_tile/builder/reflect/instance_traits.hpp>
diff --git a/experimental/builder/test/test_get_instance_string_fwd_grp_conv_dl.cpp b/experimental/builder/test/test_get_instance_string_fwd_grp_conv_dl.cpp
index 54e026b308..abf55322b6 100644
--- a/experimental/builder/test/test_get_instance_string_fwd_grp_conv_dl.cpp
+++ b/experimental/builder/test/test_get_instance_string_fwd_grp_conv_dl.cpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <gtest/gtest.h>
 #include <ck_tile/builder/reflect/instance_traits.hpp>
diff --git a/experimental/builder/test/test_get_instance_string_fwd_grp_conv_large_tensor.cpp b/experimental/builder/test/test_get_instance_string_fwd_grp_conv_large_tensor.cpp
index fd75fd9e6b..d6900d041f 100644
--- a/experimental/builder/test/test_get_instance_string_fwd_grp_conv_large_tensor.cpp
+++ b/experimental/builder/test/test_get_instance_string_fwd_grp_conv_large_tensor.cpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <gtest/gtest.h>
 #include <ck_tile/builder/reflect/instance_traits.hpp>
diff --git a/experimental/builder/test/test_get_instance_string_fwd_grp_conv_v3.cpp b/experimental/builder/test/test_get_instance_string_fwd_grp_conv_v3.cpp
index 45ab5db322..67e55173de 100644
--- a/experimental/builder/test/test_get_instance_string_fwd_grp_conv_v3.cpp
+++ b/experimental/builder/test/test_get_instance_string_fwd_grp_conv_v3.cpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <gtest/gtest.h>
 #include <ck_tile/builder/reflect/instance_traits.hpp>
diff --git a/experimental/builder/test/test_get_instance_string_fwd_grp_conv_wmma.cpp b/experimental/builder/test/test_get_instance_string_fwd_grp_conv_wmma.cpp
index e1505659f5..ccce080f9d 100644
--- a/experimental/builder/test/test_get_instance_string_fwd_grp_conv_wmma.cpp
+++ b/experimental/builder/test/test_get_instance_string_fwd_grp_conv_wmma.cpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <gtest/gtest.h>
 #include <ck_tile/builder/reflect/instance_traits.hpp>
diff --git a/experimental/builder/test/test_inline_diff.cpp b/experimental/builder/test/test_inline_diff.cpp
index 41692fb40e..7a1b0ab8c3 100644
--- a/experimental/builder/test/test_inline_diff.cpp
+++ b/experimental/builder/test/test_inline_diff.cpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <gtest/gtest.h>
 
diff --git a/experimental/builder/test/test_instance_traits_util.cpp b/experimental/builder/test/test_instance_traits_util.cpp
index 04d3cb978b..5623028699 100644
--- a/experimental/builder/test/test_instance_traits_util.cpp
+++ b/experimental/builder/test/test_instance_traits_util.cpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <gtest/gtest.h>
 #include <gmock/gmock.h>
diff --git a/experimental/builder/test/test_testing_utils.cpp b/experimental/builder/test/test_testing_utils.cpp
index 24a1c9bc81..89d8c06e59 100644
--- a/experimental/builder/test/test_testing_utils.cpp
+++ b/experimental/builder/test/test_testing_utils.cpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp>
 
diff --git a/experimental/builder/test/testing_utils.cpp b/experimental/builder/test/testing_utils.cpp
index 34793b601e..02663317f1 100644
--- a/experimental/builder/test/testing_utils.cpp
+++ b/experimental/builder/test/testing_utils.cpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "testing_utils.hpp"
 #include <gtest/gtest.h>
diff --git a/experimental/builder/test/testing_utils.hpp b/experimental/builder/test/testing_utils.hpp
index 3ff2eb32de..ae5811bd4b 100644
--- a/experimental/builder/test/testing_utils.hpp
+++ b/experimental/builder/test/testing_utils.hpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <ck/library/tensor_operation_instance/device_operation_instance_factory.hpp>
 #include <gtest/gtest.h>
diff --git a/experimental/builder/test/utils/ckb_conv_test_common.hpp b/experimental/builder/test/utils/ckb_conv_test_common.hpp
index d18a008015..14fae566f6 100644
--- a/experimental/builder/test/utils/ckb_conv_test_common.hpp
+++ b/experimental/builder/test/utils/ckb_conv_test_common.hpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -16,7 +16,7 @@ using namespace test;
 // Common test implementation
 template <ConvSignature FwdConvSignature,
           ThreadBlock FwdThreadBlock,
-          BlockGemmPipelineVersion FwdPipelineVersion,
+          PipelineVersion FwdPipelineVersion,
           ConvFwdSpecialization FwdConvSpecialization>
 constexpr void run_test_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3()
 {
@@ -52,7 +52,7 @@ constexpr void run_test_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3()
                                                 .src_access_order_b            = {1, 0, 2}};
 
     constexpr BlockGemm BlockGemmDesc = {.pipeline_version = FwdPipelineVersion,
-                                         .scheduler        = BlockGemmPipelineScheduler::INTRAWAVE};
+                                         .scheduler        = PipelineScheduler::INTRAWAVE};
 
     constexpr ConvAlgorithm_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3 FwdConvAlgorithm{
         .thread_block        = FwdThreadBlock,
@@ -73,13 +73,13 @@ constexpr void run_test_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3()
     EXPECT_TRUE(kernel_string.starts_with("DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3"));
 
     // Verify pipeline version is correct
-    if(FwdPipelineVersion == BlockGemmPipelineVersion::V1)
+    if(FwdPipelineVersion == PipelineVersion::V1)
         EXPECT_TRUE(kernel_string.find("BlkGemmPipelineVersion: v1") != std::string::npos);
-    else if(FwdPipelineVersion == BlockGemmPipelineVersion::V3)
+    else if(FwdPipelineVersion == PipelineVersion::V3)
         EXPECT_TRUE(kernel_string.find("BlkGemmPipelineVersion: v3") != std::string::npos);
-    else if(FwdPipelineVersion == BlockGemmPipelineVersion::V4)
+    else if(FwdPipelineVersion == PipelineVersion::V4)
         EXPECT_TRUE(kernel_string.find("BlkGemmPipelineVersion: v4") != std::string::npos);
-    else if(FwdPipelineVersion == BlockGemmPipelineVersion::V5)
+    else if(FwdPipelineVersion == PipelineVersion::V5)
         EXPECT_TRUE(kernel_string.find("BlkGemmPipelineVersion: v5") != std::string::npos);
 
     // Verify specialization is correct
@@ -140,7 +140,7 @@ constexpr void run_test_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle()
         .gemm_specialization        = GemmSpecialization::MNKPadding,
         .num_gemm_k_prefetch_stages = 1,
         .num_groups_to_merge        = 2,
-        .loop_scheduler             = LoopScheduler::DEFAULT};
+        .loop_scheduler             = PipelineScheduler::DEFAULT};
 
     using Builder = ConvBuilder<FwdConvSignature, FwdConvAlgorithm>;
 
@@ -176,7 +176,7 @@ constexpr void run_test_DeviceGroupedConvFwdMultipleD_Wmma_CShuffle()
                                              .n_per_wmma       = 32,
                                              .m_wmma_per_wave  = 2,
                                              .n_wmma_per_wave  = 1,
-                                             .pipeline_version = GridwiseGemmPipelineVersion::V1};
+                                             .pipeline_version = PipelineVersion::V1};
 
     constexpr BlockTransferABC FwdBlockTransfer{.block_transfer_a = {.k0 = 4, .m_n = 32, .k1 = 1},
                                                 .block_transfer_b = {.k0 = 4, .m_n = 32, .k1 = 1},
@@ -209,7 +209,7 @@ constexpr void run_test_DeviceGroupedConvFwdMultipleD_Wmma_CShuffle()
         .fwd_specialization         = FwdConvSpecialization,
         .gemm_specialization        = GemmSpecialization::MNKPadding,
         .num_gemm_k_prefetch_stages = 1,
-        .loop_scheduler             = LoopScheduler::DEFAULT};
+        .loop_scheduler             = PipelineScheduler::DEFAULT};
 
     using Builder = ConvBuilder<FwdConvSignature, FwdConvAlgorithm>;
 
@@ -235,4 +235,149 @@ constexpr void run_test_DeviceGroupedConvFwdMultipleD_Wmma_CShuffle()
     EXPECT_NE(invoker_ptr, nullptr);
 }
 
+template <ConvSignature FwdConvSignature,
+          ThreadBlock FwdThreadBlock,
+          ConvFwdSpecialization FwdConvSpecialization>
+constexpr void run_test_DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK()
+{
+    // DL thread configuration
+    constexpr DlThreadConfig DlThreadCfg{
+        .k0_per_block = 16, .k1 = 2, .m1_per_thread = 4, .n1_per_thread = 4, .k_per_thread = 1};
+
+    // DL thread cluster
+    constexpr DlThreadCluster DlCluster{.m1_xs = {8, 2}, .n1_xs = {8, 2}};
+
+    // DL A block transfer - K0_M0_M1_K1 format
+    constexpr DlBlockTransferK0M0M1K1 DlBlockTransferA{
+        .thread_slice_lengths                   = {8, 1, 1, 2},
+        .thread_cluster_lengths                 = {2, 1, 128, 1},
+        .thread_cluster_arrange_order           = {1, 2, 0, 3},
+        .src_access_order                       = {1, 2, 0, 3},
+        .src_vector_tensor_lengths              = {4, 1, 1, 2},
+        .src_vector_tensor_contiguous_dim_order = {1, 2, 0, 3},
+        .dst_vector_tensor_lengths              = {1, 1, 1, 2}};
+
+    // DL B block transfer - K0_N0_N1_K1 format (same values as the A transfer above)
+    constexpr DlBlockTransferK0N0N1K1 DlBlockTransferB{
+        .thread_slice_lengths                   = {8, 1, 1, 2},
+        .thread_cluster_lengths                 = {2, 1, 128, 1},
+        .thread_cluster_arrange_order           = {1, 2, 0, 3},
+        .src_access_order                       = {1, 2, 0, 3},
+        .src_vector_tensor_lengths              = {4, 1, 1, 2},
+        .src_vector_tensor_contiguous_dim_order = {1, 2, 0, 3},
+        .dst_vector_tensor_lengths              = {1, 1, 1, 2}};
+
+    // DL C thread transfer
+    constexpr DlCThreadTransfer DlCTransfer{.src_dst_access_order  = {0, 1, 2, 3, 4, 5},
+                                            .src_dst_vector_dim    = 5,
+                                            .dst_scalar_per_vector = 4};
+
+    constexpr ConvAlgorithm_DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK FwdConvAlgorithm{
+        .thread_block         = FwdThreadBlock,
+        .fwd_specialization   = FwdConvSpecialization,
+        .gemm_specialization  = GemmSpecialization::MNKPadding,
+        .dl_thread_config     = DlThreadCfg,
+        .dl_thread_cluster    = DlCluster,
+        .dl_block_transfer_a  = DlBlockTransferA,
+        .dl_block_transfer_b  = DlBlockTransferB,
+        .dl_c_thread_transfer = DlCTransfer};
+    // Instantiate the type the builder produces for this signature/algorithm pair.
+    using Builder = ConvBuilder<FwdConvSignature, FwdConvAlgorithm>;
+
+    auto instance = typename Builder::Instance{};
+
+    const auto kernel_string = instance.GetTypeString();
+    std::cout << "Generated kernel: " << kernel_string << std::endl;
+    EXPECT_GT(kernel_string.size(), 0);
+    // The type string must name the DL device operation.
+    EXPECT_TRUE(kernel_string.starts_with("DeviceGroupedConvFwdDlMultipleD_NHWC_KYXC_NHWK"));
+
+    // Verify the specialization name appears in the type string (substring match only)
+    if(FwdConvSpecialization == ConvFwdSpecialization::DEFAULT)
+        EXPECT_TRUE(kernel_string.find("Default") != std::string::npos);
+    else if(FwdConvSpecialization == ConvFwdSpecialization::FILTER_1X1_PAD0)
+        EXPECT_TRUE(kernel_string.find("Filter1x1Pad0") != std::string::npos);
+    else if(FwdConvSpecialization == ConvFwdSpecialization::FILTER_1X1_STRIDE1_PAD0)
+        EXPECT_TRUE(kernel_string.find("Filter1x1Stride1Pad0") != std::string::npos);
+    else if(FwdConvSpecialization == ConvFwdSpecialization::FILTER_3x3)
+        EXPECT_TRUE(kernel_string.find("Filter3x3") != std::string::npos);
+    // The instance must also be able to hand out an invoker.
+    const auto invoker_ptr = instance.MakeInvokerPointer();
+    EXPECT_NE(invoker_ptr, nullptr);
+}
+
+// Test helper for DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor: builds the instance and
+// checks its type string and invoker. Same parameters as the regular XDL CShuffle helper.
+template <ConvSignature FwdConvSignature,
+          ThreadBlock FwdThreadBlock,
+          ConvFwdSpecialization FwdConvSpecialization>
+constexpr void run_test_DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor()
+{
+    constexpr GridwiseXdlGemm FwdGemmParams{.ak1            = 8,
+                                            .bk1            = 8,
+                                            .m_per_xdl      = 32,
+                                            .n_per_xdl      = 32,
+                                            .m_xdl_per_wave = 2,
+                                            .n_xdl_per_wave = 1};
+
+    constexpr BlockTransferABC FwdBlockTransfer{.block_transfer_a = {.k0 = 4, .m_n = 16, .k1 = 1},
+                                                .block_transfer_b = {.k0 = 4, .m_n = 16, .k1 = 1},
+                                                .thread_cluster_dims_c = {.m_block        = 1,
+                                                                          .m_wave_per_xdl = 16,
+                                                                          .n_block        = 1,
+                                                                          .n_wave_per_xdl = 4},
+                                                .lds_transfer_a        = {.src_vector_dim            = 2,
+                                                                          .src_scalar_per_vector     = 8,
+                                                                          .lds_dst_scalar_per_vector = 8,
+                                                                          .is_direct_load = false,
+                                                                          .lds_padding    = true},
+                                                .lds_transfer_b        = {.src_vector_dim            = 2,
+                                                                          .src_scalar_per_vector     = 8,
+                                                                          .lds_dst_scalar_per_vector = 8,
+                                                                          .is_direct_load = false,
+                                                                          .lds_padding    = true},
+                                                .epilogue_c = {.m_per_wave_per_shuffle = 1,
+                                                               .n_per_wave_per_shuffle = 1,
+                                                               .scalar_per_vector      = 8},
+                                                .block_transfer_access_order_a = {1, 0, 2},
+                                                .block_transfer_access_order_b = {1, 0, 2},
+                                                .src_access_order_a            = {1, 0, 2},
+                                                .src_access_order_b            = {1, 0, 2}};
+
+    // Large_Tensor uses the same descriptor as regular XDL CShuffle
+    constexpr ConvAlgorithm_DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle FwdConvAlgorithm{
+        .thread_block               = FwdThreadBlock,
+        .gridwise_gemm              = FwdGemmParams,
+        .block_transfer             = FwdBlockTransfer,
+        .fwd_specialization         = FwdConvSpecialization,
+        .gemm_specialization        = GemmSpecialization::MNKPadding,
+        .num_gemm_k_prefetch_stages = 1,
+        .num_groups_to_merge        = 1,
+        .loop_scheduler             = PipelineScheduler::DEFAULT};
+
+    using Builder = ConvBuilder<FwdConvSignature, FwdConvAlgorithm>;
+
+    auto instance = typename Builder::Instance{};
+
+    const auto kernel_string = instance.GetTypeString();
+    std::cout << "Generated kernel: " << kernel_string << std::endl;
+    EXPECT_GT(kernel_string.size(), 0);
+    // The type string must name the Large_Tensor device operation.
+    EXPECT_TRUE(
+        kernel_string.starts_with("DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor"));
+
+    // Verify the specialization name appears in the type string (substring match only)
+    if(FwdConvSpecialization == ConvFwdSpecialization::DEFAULT)
+        EXPECT_TRUE(kernel_string.find("Default") != std::string::npos);
+    else if(FwdConvSpecialization == ConvFwdSpecialization::FILTER_1X1_PAD0)
+        EXPECT_TRUE(kernel_string.find("Filter1x1Pad0") != std::string::npos);
+    else if(FwdConvSpecialization == ConvFwdSpecialization::FILTER_1X1_STRIDE1_PAD0)
+        EXPECT_TRUE(kernel_string.find("Filter1x1Stride1Pad0") != std::string::npos);
+    else if(FwdConvSpecialization == ConvFwdSpecialization::FILTER_3x3)
+        EXPECT_TRUE(kernel_string.find("Filter3x3") != std::string::npos);
+    // The instance must also be able to hand out an invoker.
+    const auto invoker_ptr = instance.MakeInvokerPointer();
+    EXPECT_NE(invoker_ptr, nullptr);
+}
+
 } // namespace ck_tile::builder::test_utils
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmma_selector.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmma_selector.hpp
index 8cff087ddb..89952910e6 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmma_selector.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmma_selector.hpp
@@ -28,6 +28,7 @@ template <BlockGemmPipelineVersion BlkGemmPipelineVer,
           index_t MRepeat,
           index_t NRepeat,
           index_t KPack,
+          index_t KInner,
           bool TransposeC = false>
 constexpr auto BlockGemmPipeline_Selector()
 {
@@ -52,6 +53,7 @@ constexpr auto BlockGemmPipeline_Selector()
                                                 MRepeat,
                                                 NRepeat,
                                                 KPack,
+                                                KInner,
                                                 TransposeC>{};
     }
     else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
@@ -75,6 +77,7 @@ constexpr auto BlockGemmPipeline_Selector()
                                                 MRepeat,
                                                 NRepeat,
                                                 KPack,
+                                                KInner,
                                                 TransposeC>{};
     }
     else
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_base.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_base.hpp
index 265db9166a..abc9720714 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_base.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_base.hpp
@@ -30,6 +30,7 @@ template <index_t BlockSize,
           index_t MRepeat,
           index_t NRepeat,
           index_t KPack,
+          index_t KInner,
           bool TransposeC = false>
 struct BlockwiseGemmWmmaops_pipeline_base
 {
@@ -38,6 +39,7 @@ struct BlockwiseGemmWmmaops_pipeline_base
     static constexpr auto I2 = Number<2>{};
     static constexpr auto I3 = Number<3>{};
     static constexpr auto I5 = Number<5>{};
+    static constexpr auto I6 = Number<6>{};
 
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 
@@ -54,15 +56,20 @@ struct BlockwiseGemmWmmaops_pipeline_base
     static constexpr index_t B_KRow = 1;
 #endif
 
-    static constexpr index_t A_K1 = AWmmaTileDesc{}.GetLength(I5);
-    static constexpr index_t B_K1 = BWmmaTileDesc{}.GetLength(I5);
+    static constexpr auto wmma_gemm = WmmaGemm<ComputeTypeA,
+                                               ComputeTypeB,
+                                               AccDataType,
+                                               MPerWmma,
+                                               NPerWmma,
+                                               KPack / KInner,
+                                               TransposeC>{};
+
+    static constexpr index_t KPerThread = wmma_gemm.wmma_instr.k_per_blk * KInner;
+    static constexpr index_t A_K1       = ck::math::min(AWmmaTileDesc{}.GetLength(I6), KPerThread);
+    static constexpr index_t B_K1       = ck::math::min(BWmmaTileDesc{}.GetLength(I6), KPerThread);
 
     static_assert(KPack % (A_K1 * A_KRow) == 0, "wrong!");
     static_assert(KPack % (B_K1 * B_KRow) == 0, "wrong!");
-
-    static constexpr auto wmma_gemm =
-        WmmaGemm<ComputeTypeA, ComputeTypeB, AccDataType, MPerWmma, NPerWmma, KPack, TransposeC>{};
-
     static constexpr index_t KRepeat = KPerBlock / KPack;
 
     static constexpr auto WmmaK = Number<wmma_gemm.wmma_instr.k_per_wmma>{};
@@ -191,8 +198,7 @@ struct BlockwiseGemmWmmaops_pipeline_base
         const auto wmma_krow = 0;
 #endif
 
-        //  |KRepeat   |MRepeat|MWave    |KRow  |MLane  |KPack
-        return make_tuple(0, 0, waveId_m, wmma_krow, wmma_a_idx, 0);
+        return make_tuple(0, 0, 0, waveId_m, wmma_krow, wmma_a_idx, 0);
     }
 
     __device__ static auto CalculateBThreadOriginDataIndex()
@@ -209,8 +215,7 @@ struct BlockwiseGemmWmmaops_pipeline_base
         const auto wmma_krow = 0;
 #endif
 
-        //  |KRepeat   |NRepeat|Nwave     |KRow  |NLane  |KPack
-        return make_tuple(0, 0, waveId_n, wmma_krow, wmma_b_idx, 0);
+        return make_tuple(0, 0, 0, waveId_n, wmma_krow, wmma_b_idx, 0);
     }
 
     template <index_t m0, index_t n0>
@@ -241,7 +246,7 @@ struct BlockwiseGemmWmmaops_pipeline_base
         return make_tuple(c_thread_m, c_thread_n);
     }
 
-    using Tuple6 = decltype(CalculateAThreadOriginDataIndex());
+    using Tuple7 = decltype(CalculateAThreadOriginDataIndex());
 
     /**
      * @brief Constructor for BlockwiseGemmWmmaops_pipeline_base.
@@ -261,8 +266,8 @@ struct BlockwiseGemmWmmaops_pipeline_base
      * repeat dimensions.
      */
     __host__ __device__
-    BlockwiseGemmWmmaops_pipeline_base(Tuple6 a_origin = CalculateAThreadOriginDataIndex(),
-                                       Tuple6 b_origin = CalculateBThreadOriginDataIndex())
+    BlockwiseGemmWmmaops_pipeline_base(Tuple7 a_origin = CalculateAThreadOriginDataIndex(),
+                                       Tuple7 b_origin = CalculateBThreadOriginDataIndex())
         : a_thread_copy_(a_origin), b_thread_copy_(b_origin)
     {
         static_assert(AWmmaTileDesc::IsKnownAtCompileTime() &&
@@ -343,12 +348,14 @@ struct BlockwiseGemmWmmaops_pipeline_base
                                                 Number<KRepeat>{},
                                                 I1,
                                                 I1,
+                                                I1,
                                                 Number<A_K1>{}),
                                      make_tuple(Number<A_K1>{},
                                                 Number<KPack / A_KRow>{},
                                                 Number<KPack / A_KRow * MRepeat>{},
                                                 I0,
                                                 I0,
+                                                I0,
                                                 I1));
 
     static constexpr auto b_thread_desc_ =
@@ -357,12 +364,14 @@ struct BlockwiseGemmWmmaops_pipeline_base
                                                 Number<KRepeat>{},
                                                 I1,
                                                 I1,
+                                                I1,
                                                 Number<B_K1>{}),
                                      make_tuple(Number<B_K1>{},
                                                 Number<KPack / B_KRow>{},
                                                 Number<KPack / B_KRow * NRepeat>{},
                                                 I0,
                                                 I0,
+                                                I0,
                                                 I1));
 
     // C[M, N, NumRegWmma]
@@ -374,9 +383,9 @@ struct BlockwiseGemmWmmaops_pipeline_base
                                          ComputeTypeA,
                                          decltype(a_block_desc_k0_m0_m1_m2_k1),
                                          decltype(a_thread_desc_),
-                                         Sequence<KPack / A_K1 / A_KRow, 1, 1, 1, 1, A_K1>,
-                                         Sequence<0, 1, 2, 3, 4, 5>,
-                                         5,
+                                         Sequence<KPack / A_K1 / A_KRow, 1, 1, 1, 1, 1, A_K1>,
+                                         Sequence<0, 1, 2, 3, 4, 5, 6>,
+                                         6,
                                          A_K1,
                                          A_K1>;
 
@@ -385,9 +394,9 @@ struct BlockwiseGemmWmmaops_pipeline_base
                                          ComputeTypeB,
                                          decltype(b_block_desc_k0_n0_n1_n2_k1),
                                          decltype(b_thread_desc_),
-                                         Sequence<KPack / B_K1 / B_KRow, 1, 1, 1, 1, B_K1>,
-                                         Sequence<0, 1, 2, 3, 4, 5>,
-                                         5,
+                                         Sequence<KPack / B_K1 / B_KRow, 1, 1, 1, 1, 1, B_K1>,
+                                         Sequence<0, 1, 2, 3, 4, 5, 6>,
+                                         6,
                                          B_K1,
                                          B_K1>;
 
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_v1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_v1.hpp
index 5d7c570428..5f731933e2 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_v1.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_v1.hpp
@@ -32,6 +32,7 @@ template <BlockGemmPipelineScheduler BlkGemmPipelineVer,
           index_t MRepeat,
           index_t NRepeat,
           index_t KPack,
+          index_t KInner,
           bool TransposeC = false>
 struct BlockwiseGemmWmmaops_pipeline_v1
 {
@@ -55,6 +56,7 @@ template <index_t BlockSize,
           index_t MRepeat,
           index_t NRepeat,
           index_t KPack,
+          index_t KInner,
           bool TransposeC>
 struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Intrawave,
                                         BlockSize,
@@ -75,6 +77,7 @@ struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Intrawave,
                                         MRepeat,
                                         NRepeat,
                                         KPack,
+                                        KInner,
                                         TransposeC>
     : BlockwiseGemmWmmaops_pipeline_base<BlockSize,
                                          ADataType,
@@ -94,6 +97,7 @@ struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Intrawave,
                                          MRepeat,
                                          NRepeat,
                                          KPack,
+                                         KInner,
                                          TransposeC>
 {
     using Base = BlockwiseGemmWmmaops_pipeline_base<BlockSize,
@@ -114,10 +118,10 @@ struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Intrawave,
                                                     MRepeat,
                                                     NRepeat,
                                                     KPack,
+                                                    KInner,
                                                     TransposeC>;
     using Base::I0;
     using Base::I1;
-    using Base::WaveSize;
     using typename Base::HotLoopInstList;
 
     using Base::A_K1;
@@ -187,6 +191,8 @@ struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Intrawave,
                         index_t num_loop,
                         index_t num_loop_per_scale) const
     {
+        constexpr index_t KPerWaveBlock = wmma_gemm.GetKPerWaveBlk();
+
         auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeTypeA>(
             a_thread_desc_.GetElementSpaceSize());
         auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeTypeB>(
@@ -211,27 +217,23 @@ struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Intrawave,
         auto blockwise_gemm_func = [&]() {
             static_for<0, KRepeat, 1>{}([&](auto k0) {
                 static_for<0, MRepeat, 1>{}([&](auto m0) {
-                    a_thread_copy_.Run(
-                        a_block_desc_k0_m0_m1_m2_k1,
-                        make_tuple(Number<k0 * KPack / A_K1 / A_KRow>{}, m0, I0, I0, I0, I0),
-                        a_block_buf,
-                        a_thread_desc_,
-                        make_tuple(I0, I0, I0, I0, I0, I0),
-                        a_thread_buf);
-
+                    a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1,
+                                       make_tuple(I0, m0, k0, I0, I0, I0, I0),
+                                       a_block_buf,
+                                       a_thread_desc_,
+                                       make_tuple(I0, I0, I0, I0, I0, I0, I0),
+                                       a_thread_buf);
                     if constexpr(m0 == I0)
                     {
                         if constexpr(ck::is_same<BScaleStruct, Empty>::value == true)
                         {
                             static_for<0, NRepeat, 1>{}([&](auto n0) {
-                                b_thread_copy_.Run(
-                                    b_block_desc_k0_n0_n1_n2_k1,
-                                    make_tuple(
-                                        Number<k0 * KPack / B_K1 / B_KRow>{}, n0, I0, I0, I0, I0),
-                                    b_block_buf,
-                                    b_thread_desc_,
-                                    make_tuple(I0, n0, I0, I0, I0, I0),
-                                    b_thread_buf);
+                                b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1,
+                                                   make_tuple(I0, n0, k0, I0, I0, I0, I0),
+                                                   b_block_buf,
+                                                   b_thread_desc_,
+                                                   make_tuple(I0, n0, I0, I0, I0, I0, I0),
+                                                   b_thread_buf);
                             });
                         }
                         else
@@ -239,45 +241,60 @@ struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Intrawave,
                             static_for<0, NRepeat, 1>{}([&](auto n0) {
                                 b_thread_copy_.Run(
                                     b_block_desc_k0_n0_n1_n2_k1,
-                                    make_tuple(
-                                        Number<k0 * KPack / B_K1 / B_KRow>{}, n0, I0, I0, I0, I0),
+                                    make_tuple(I0, n0, k0, I0, I0, I0, I0),
                                     b_block_buf,
                                     b_scale_struct.b_scale_thread_bufs(
                                         I0)[Number<n0 * BScaleStruct::num_scale_k_block +
                                                    k0 / BScaleStruct::num_scale_krepeat>{}],
                                     b_thread_desc_,
-                                    make_tuple(I0, n0, I0, I0, I0, I0),
+                                    make_tuple(I0, n0, I0, I0, I0, I0, I0),
                                     b_thread_buf);
                             });
                         }
                     }
 
-                    static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        vector_type<ComputeTypeA, KPack / A_KRow> a_thread_vec;
-                        vector_type<ComputeTypeB, KPack / B_KRow> b_thread_vec;
+                    static_for<0, KInner, 1>{}([&](auto k_inner) {
+                        static_for<0, NRepeat, 1>{}([&](auto n0) {
+                            vector_type<ComputeTypeA, KPack / A_KRow / KInner> a_thread_vec;
+                            vector_type<ComputeTypeB, KPack / B_KRow / KInner> b_thread_vec;
 
-                        static_for<0, KPack / A_KRow, 1>{}([&](auto ik) {
-                            a_thread_vec.template AsType<ComputeTypeA>()(ik) =
-                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(make_tuple(
-                                    Number<ik / A_K1>{}, I0, I0, I0, I0, Number<ik % A_K1>{}))>{}];
+                            static_for<0, KPack / A_KRow / KInner, 1>{}([&](auto ik) {
+                                constexpr index_t kk = ik + k_inner * KPerWaveBlock;
+                                a_thread_vec.template AsType<ComputeTypeA>()(ik) =
+                                    a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                        make_tuple(Number<kk / A_K1>{},
+                                                   I0,
+                                                   I0,
+                                                   I0,
+                                                   I0,
+                                                   I0,
+                                                   Number<kk % A_K1>{}))>{}];
+                            });
+                            static_for<0, KPack / B_KRow / KInner, 1>{}([&](auto ik) {
+                                constexpr index_t kk = ik + k_inner * KPerWaveBlock;
+                                b_thread_vec.template AsType<ComputeTypeB>()(ik) =
+                                    b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                        make_tuple(Number<kk / B_K1>{},
+                                                   n0,
+                                                   I0,
+                                                   I0,
+                                                   I0,
+                                                   I0,
+                                                   Number<kk % B_K1>{}))>{}];
+                            });
+
+                            using wmma_input_type_a =
+                                typename vector_type<ComputeTypeA, WmmaK / A_KRow>::type;
+                            using wmma_input_type_b =
+                                typename vector_type<ComputeTypeB, WmmaK / B_KRow>::type;
+
+                            constexpr index_t c_offset =
+                                c_thread_desc_.CalculateOffset(make_tuple(m0, n0, I0));
+
+                            wmma_gemm.Run(a_thread_vec.template AsType<wmma_input_type_a>(),
+                                          b_thread_vec.template AsType<wmma_input_type_b>(),
+                                          c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                         });
-                        static_for<0, KPack / B_KRow, 1>{}([&](auto ik) {
-                            b_thread_vec.template AsType<ComputeTypeB>()(ik) =
-                                b_thread_buf[Number<b_thread_desc_.CalculateOffset(make_tuple(
-                                    Number<ik / B_K1>{}, n0, I0, I0, I0, Number<ik % B_K1>{}))>{}];
-                        });
-
-                        using wmma_input_type_a =
-                            typename vector_type<ComputeTypeA, WmmaK / A_KRow>::type;
-                        using wmma_input_type_b =
-                            typename vector_type<ComputeTypeB, WmmaK / B_KRow>::type;
-
-                        constexpr index_t c_offset =
-                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, I0));
-
-                        wmma_gemm.Run(a_thread_vec.template AsType<wmma_input_type_a>(),
-                                      b_thread_vec.template AsType<wmma_input_type_b>(),
-                                      c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                     });
                 });
             });
@@ -324,8 +341,10 @@ struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Intrawave,
                                 __builtin_amdgcn_sched_group_barrier(0x100, 1, 0); // DS read
                             });
                         }
-                        static_for<0, NRepeat, 1>{}([&](auto) {
-                            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // WMMA
+                        static_for<0, KInner, 1>{}([&](auto) {
+                            static_for<0, NRepeat, 1>{}([&](auto) {
+                                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // WMMA
+                            });
                         });
                     });
                 });
@@ -348,20 +367,20 @@ struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Intrawave,
     protected:
     // A[MRepeat, I1, I1, KPack]
     static constexpr auto a_thread_desc_ = make_naive_tensor_descriptor_packed(
-        make_tuple(Number<KPack / A_K1 / A_KRow>{}, I1, I1, I1, I1, Number<A_K1>{}));
+        make_tuple(Number<KPack / A_K1 / A_KRow>{}, I1, I1, I1, I1, I1, Number<A_K1>{}));
 
     // B[NRepeat, N1, N2, KPack]
-    static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed(
-        make_tuple(Number<KPack / B_K1 / B_KRow>{}, Number<NRepeat>{}, I1, I1, I1, Number<B_K1>{}));
+    static constexpr auto b_thread_desc_ = make_naive_tensor_descriptor_packed(make_tuple(
+        Number<KPack / B_K1 / B_KRow>{}, Number<NRepeat>{}, I1, I1, I1, I1, Number<B_K1>{}));
 
     using AThreadCopy =
         ThreadwiseTensorSliceTransfer_v4<ADataType,
                                          ComputeTypeA,
                                          decltype(a_block_desc_k0_m0_m1_m2_k1),
                                          decltype(a_thread_desc_),
-                                         Sequence<KPack / A_K1 / A_KRow, 1, 1, 1, 1, A_K1>,
-                                         Sequence<0, 1, 2, 3, 4, 5>,
-                                         5,
+                                         Sequence<KPack / A_K1 / A_KRow, 1, 1, 1, 1, 1, A_K1>,
+                                         Sequence<0, 1, 2, 3, 4, 5, 6>,
+                                         6,
                                          A_K1,
                                          A_K1>;
 
@@ -370,9 +389,9 @@ struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Intrawave,
                                          ComputeTypeB,
                                          decltype(b_block_desc_k0_n0_n1_n2_k1),
                                          decltype(b_thread_desc_),
-                                         Sequence<KPack / B_K1 / B_KRow, 1, 1, 1, 1, B_K1>,
-                                         Sequence<0, 1, 2, 3, 4, 5>,
-                                         5,
+                                         Sequence<KPack / B_K1 / B_KRow, 1, 1, 1, 1, 1, B_K1>,
+                                         Sequence<0, 1, 2, 3, 4, 5, 6>,
+                                         6,
                                          B_K1,
                                          B_K1>;
 
@@ -399,6 +418,7 @@ template <index_t BlockSize,
           index_t MRepeat,
           index_t NRepeat,
           index_t KPack,
+          index_t KInner,
           bool TransposeC>
 struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Interwave,
                                         BlockSize,
@@ -419,6 +439,7 @@ struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Interwave,
                                         MRepeat,
                                         NRepeat,
                                         KPack,
+                                        KInner,
                                         TransposeC>
     : BlockwiseGemmWmmaops_pipeline_base<BlockSize,
                                          ADataType,
@@ -438,6 +459,7 @@ struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Interwave,
                                          MRepeat,
                                          NRepeat,
                                          KPack,
+                                         KInner,
                                          TransposeC>
 {
     using Base = BlockwiseGemmWmmaops_pipeline_base<BlockSize,
@@ -458,6 +480,7 @@ struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Interwave,
                                                     MRepeat,
                                                     NRepeat,
                                                     KPack,
+                                                    KInner,
                                                     TransposeC>;
     using Base::I0;
     using Base::I1;
@@ -532,6 +555,8 @@ struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Interwave,
                         index_t num_loop,
                         index_t num_loop_per_scale) const
     {
+        constexpr index_t KPerWaveBlock = wmma_gemm.GetKPerWaveBlk();
+
         auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeTypeA>(
             a_thread_desc_.GetElementSpaceSize());
         auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeTypeB>(
@@ -557,33 +582,22 @@ struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Interwave,
             static_for<0, KRepeat, KRepeatPerCluster>{}([&](auto k0_offset) {
                 static_for<0, KRepeatPerCluster, 1>{}([&](auto k0_inner) {
                     static_for<0, MRepeat, 1>{}([&](auto m0) {
-                        a_thread_copy_.Run(
-                            a_block_desc_k0_m0_m1_m2_k1,
-                            make_tuple(Number<(k0_offset + k0_inner) * KPack / A_K1 / A_KRow>{},
-                                       m0,
-                                       I0,
-                                       I0,
-                                       I0,
-                                       I0),
-                            a_block_buf,
-                            a_thread_desc_,
-                            make_tuple(I0, m0, k0_inner, I0, I0, I0),
-                            a_thread_buf);
+                        a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1,
+                                           make_tuple(I0, m0, k0_offset + k0_inner, I0, I0, I0, I0),
+                                           a_block_buf,
+                                           a_thread_desc_,
+                                           make_tuple(I0, m0, k0_inner, I0, I0, I0, I0),
+                                           a_thread_buf);
                     });
                     if constexpr(ck::is_same<BScaleStruct, Empty>::value == true)
                     {
                         static_for<0, NRepeat, 1>{}([&](auto n0) {
                             b_thread_copy_.Run(
                                 b_block_desc_k0_n0_n1_n2_k1,
-                                make_tuple(Number<(k0_offset + k0_inner) * KPack / B_K1 / B_KRow>{},
-                                           n0,
-                                           I0,
-                                           I0,
-                                           I0,
-                                           I0),
+                                make_tuple(I0, n0, k0_offset + k0_inner, I0, I0, I0, I0),
                                 b_block_buf,
                                 b_thread_desc_,
-                                make_tuple(I0, n0, k0_inner, I0, I0, I0),
+                                make_tuple(I0, n0, k0_inner, I0, I0, I0, I0),
                                 b_thread_buf);
                         });
                     }
@@ -592,18 +606,13 @@ struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Interwave,
                         static_for<0, NRepeat, 1>{}([&](auto n0) {
                             b_thread_copy_.Run(
                                 b_block_desc_k0_n0_n1_n2_k1,
-                                make_tuple(Number<(k0_offset + k0_inner) * KPack / B_K1 / B_KRow>{},
-                                           n0,
-                                           I0,
-                                           I0,
-                                           I0,
-                                           I0),
+                                make_tuple(I0, n0, k0_offset + k0_inner, I0, I0, I0, I0),
                                 b_block_buf,
                                 b_scale_struct.b_scale_thread_bufs(I0)[Number<
                                     n0 * BScaleStruct::num_scale_k_block +
                                     (k0_offset + k0_inner) / BScaleStruct::num_scale_krepeat>{}],
                                 b_thread_desc_,
-                                make_tuple(I0, n0, k0_inner, I0, I0, I0),
+                                make_tuple(I0, n0, k0_inner, I0, I0, I0, I0),
                                 b_thread_buf);
                         });
                     }
@@ -622,62 +631,69 @@ struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Interwave,
                     __builtin_amdgcn_sched_barrier(0);
                 }
                 static_for<0, KRepeatPerCluster, 1>{}([&](auto k0_inner) {
-                    static_for<0, MRepeat, 1>{}([&](auto m0) {
-                        static_for<0, NRepeat, 1>{}([&](auto n0) {
-                            vector_type<ComputeTypeA, KPack / A_KRow> a_thread_vec;
-                            vector_type<ComputeTypeB, KPack / B_KRow> b_thread_vec;
+                    static_for<0, KInner, 1>{}([&](auto k_inner) {
+                        static_for<0, MRepeat, 1>{}([&](auto m0) {
+                            static_for<0, NRepeat, 1>{}([&](auto n0) {
+                                vector_type<ComputeTypeA, KPack / A_KRow / KInner> a_thread_vec;
+                                vector_type<ComputeTypeB, KPack / B_KRow / KInner> b_thread_vec;
 
-                            static_for<0, KPack / A_KRow, 1>{}([&](auto ik) {
-                                a_thread_vec.template AsType<ComputeTypeA>()(ik) =
-                                    a_thread_buf[Number<a_thread_desc_.CalculateOffset(
-                                        make_tuple(Number<ik / A_K1>{},
-                                                   m0,
-                                                   k0_inner,
-                                                   I0,
-                                                   I0,
-                                                   Number<ik % A_K1>{}))>{}];
+                                static_for<0, KPack / A_KRow / KInner, 1>{}([&](auto ik) {
+                                    constexpr index_t kk = ik + k_inner * KPerWaveBlock;
+                                    a_thread_vec.template AsType<ComputeTypeA>()(ik) =
+                                        a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                            make_tuple(Number<kk / A_K1>{},
+                                                       m0,
+                                                       k0_inner,
+                                                       I0,
+                                                       I0,
+                                                       I0,
+                                                       Number<kk % A_K1>{}))>{}];
+                                });
+                                static_for<0, KPack / B_KRow / KInner, 1>{}([&](auto ik) {
+                                    constexpr index_t kk = ik + k_inner * KPerWaveBlock;
+                                    b_thread_vec.template AsType<ComputeTypeB>()(ik) =
+                                        b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                            make_tuple(Number<kk / B_K1>{},
+                                                       n0,
+                                                       k0_inner,
+                                                       I0,
+                                                       I0,
+                                                       I0,
+                                                       Number<kk % B_K1>{}))>{}];
+                                });
+
+                                using wmma_input_type_a =
+                                    typename vector_type<ComputeTypeA, WmmaK / A_KRow>::type;
+                                using wmma_input_type_b =
+                                    typename vector_type<ComputeTypeB, WmmaK / B_KRow>::type;
+
+                                constexpr index_t c_offset =
+                                    c_thread_desc_.CalculateOffset(make_tuple(m0, n0, I0));
+
+                                // The block_sync_lds() here performs double duty:
+                                // A) safeguard against data hazard.
+                                // B) reduce VMEM FIFO congestion by applying small delays to
+                                // different wavefronts.
+                                // It is performed near the end of MAC cluster to minimize lgkmcnt
+                                // penalty
+                                if constexpr(k0_offset + k0_inner == KRepeat - 1 &&
+                                             m0 == MRepeat - 1 && n0 == NRepeat - 1)
+                                {
+                                    __builtin_amdgcn_sched_barrier(0);
+                                    block_sync_lds();
+                                    __builtin_amdgcn_sched_barrier(0);
+                                }
+                                wmma_gemm.Run(
+                                    a_thread_vec.template AsType<wmma_input_type_a>(),
+                                    b_thread_vec.template AsType<wmma_input_type_b>(),
+                                    c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                                if constexpr(k0_inner == 0 && m0 == 0 && n0 == 0)
+                                {
+                                    __builtin_amdgcn_sched_barrier(0);
+                                    __builtin_amdgcn_s_setprio(1);
+                                    __builtin_amdgcn_sched_barrier(0);
+                                }
                             });
-                            static_for<0, KPack / B_KRow, 1>{}([&](auto ik) {
-                                b_thread_vec.template AsType<ComputeTypeB>()(ik) =
-                                    b_thread_buf[Number<b_thread_desc_.CalculateOffset(
-                                        make_tuple(Number<ik / B_K1>{},
-                                                   n0,
-                                                   k0_inner,
-                                                   I0,
-                                                   I0,
-                                                   Number<ik % B_K1>{}))>{}];
-                            });
-
-                            using wmma_input_type_a =
-                                typename vector_type<ComputeTypeA, WmmaK / A_KRow>::type;
-                            using wmma_input_type_b =
-                                typename vector_type<ComputeTypeB, WmmaK / B_KRow>::type;
-
-                            constexpr index_t c_offset =
-                                c_thread_desc_.CalculateOffset(make_tuple(m0, n0, I0));
-
-                            // The block_sync_lds() here performs double duty:
-                            // A) safeguard against data hazard.
-                            // B) reduce VMEM FIFO congestion by applying small delays to
-                            // different wavefronts.
-                            // It is performed near the end of MAC cluster to minimize lgkmcnt
-                            // penalty
-                            if constexpr(k0_offset + k0_inner == KRepeat - 1 && m0 == MRepeat - 1 &&
-                                         n0 == NRepeat - 1)
-                            {
-                                __builtin_amdgcn_sched_barrier(0);
-                                block_sync_lds();
-                                __builtin_amdgcn_sched_barrier(0);
-                            }
-                            wmma_gemm.Run(a_thread_vec.template AsType<wmma_input_type_a>(),
-                                          b_thread_vec.template AsType<wmma_input_type_b>(),
-                                          c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
-                            if constexpr(k0_inner == 0 && m0 == 0 && n0 == 0)
-                            {
-                                __builtin_amdgcn_sched_barrier(0);
-                                __builtin_amdgcn_s_setprio(1);
-                                __builtin_amdgcn_sched_barrier(0);
-                            }
                         });
                     });
                 });
@@ -729,12 +745,14 @@ struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Interwave,
                                                 Number<KRepeatPerCluster>{},
                                                 I1,
                                                 I1,
+                                                I1,
                                                 Number<A_K1>{}),
                                      make_tuple(Number<A_K1>{},
                                                 Number<KPack / A_KRow>{},
                                                 Number<KPack / A_KRow * MRepeat>{},
                                                 I0,
                                                 I0,
+                                                I0,
                                                 I1));
 
     static constexpr auto b_thread_desc_ =
@@ -743,12 +761,14 @@ struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Interwave,
                                                 Number<KRepeatPerCluster>{},
                                                 I1,
                                                 I1,
+                                                I1,
                                                 Number<B_K1>{}),
                                      make_tuple(Number<B_K1>{},
                                                 Number<KPack / B_KRow>{},
                                                 Number<KPack / B_KRow * NRepeat>{},
                                                 I0,
                                                 I0,
+                                                I0,
                                                 I1));
 
     using AThreadCopy =
@@ -756,9 +776,9 @@ struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Interwave,
                                          ComputeTypeA,
                                          decltype(a_block_desc_k0_m0_m1_m2_k1),
                                          decltype(a_thread_desc_),
-                                         Sequence<KPack / A_K1 / A_KRow, 1, 1, 1, 1, A_K1>,
-                                         Sequence<0, 1, 2, 3, 4, 5>,
-                                         5,
+                                         Sequence<KPack / A_K1 / A_KRow, 1, 1, 1, 1, 1, A_K1>,
+                                         Sequence<0, 1, 2, 3, 4, 5, 6>,
+                                         6,
                                          A_K1,
                                          A_K1>;
 
@@ -767,9 +787,9 @@ struct BlockwiseGemmWmmaops_pipeline_v1<BlockGemmPipelineScheduler::Interwave,
                                          ComputeTypeB,
                                          decltype(b_block_desc_k0_n0_n1_n2_k1),
                                          decltype(b_thread_desc_),
-                                         Sequence<KPack / B_K1 / B_KRow, 1, 1, 1, 1, B_K1>,
-                                         Sequence<0, 1, 2, 3, 4, 5>,
-                                         5,
+                                         Sequence<KPack / B_K1 / B_KRow, 1, 1, 1, 1, 1, B_K1>,
+                                         Sequence<0, 1, 2, 3, 4, 5, 6>,
+                                         6,
                                          B_K1,
                                          B_K1>;
 
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_v3.hpp
index 83dadb2175..cbe13b6e00 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_wmmaops_v3.hpp
@@ -32,6 +32,7 @@ template <BlockGemmPipelineScheduler BlkGemmPipelineVer,
           index_t MRepeat,
           index_t NRepeat,
           index_t KPack,
+          index_t KInner,
           bool TransposeC = false>
 struct BlockwiseGemmWmmaops_pipeline_v3
 {
@@ -55,6 +56,7 @@ template <index_t BlockSize,
           index_t MRepeat,
           index_t NRepeat,
           index_t KPack,
+          index_t KInner,
           bool TransposeC>
 struct BlockwiseGemmWmmaops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
                                         BlockSize,
@@ -75,6 +77,7 @@ struct BlockwiseGemmWmmaops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
                                         MRepeat,
                                         NRepeat,
                                         KPack,
+                                        KInner,
                                         TransposeC>
     : BlockwiseGemmWmmaops_pipeline_base<BlockSize,
                                          ADataType,
@@ -94,6 +97,7 @@ struct BlockwiseGemmWmmaops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
                                          MRepeat,
                                          NRepeat,
                                          KPack,
+                                         KInner,
                                          TransposeC>
 {
     using Base = BlockwiseGemmWmmaops_pipeline_base<BlockSize,
@@ -114,6 +118,7 @@ struct BlockwiseGemmWmmaops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
                                                     MRepeat,
                                                     NRepeat,
                                                     KPack,
+                                                    KInner,
                                                     TransposeC>;
     using Base::I0;
 
@@ -290,40 +295,37 @@ struct BlockwiseGemmWmmaops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
     {
         static_for<0, KRepeat, 1>{}([&](auto k0) {
             static_for<0, MRepeat, 1>{}([&](auto m0) {
-                a_thread_copy_.Run(
-                    a_block_desc_k0_m0_m1_m2_k1,
-                    make_tuple(Number<k0 * KPack / A_K1 / A_KRow>{}, m0, I0, I0, I0, I0),
-                    a_block_buf,
-                    a_thread_desc_,
-                    make_tuple(I0, m0, k0, I0, I0, I0),
-                    a_thread_buf);
+                a_thread_copy_.Run(a_block_desc_k0_m0_m1_m2_k1,
+                                   make_tuple(I0, m0, k0, I0, I0, I0, I0),
+                                   a_block_buf,
+                                   a_thread_desc_,
+                                   make_tuple(I0, m0, k0, I0, I0, I0, I0),
+                                   a_thread_buf);
             });
 
             if constexpr(ck::is_same_v<BScaleStruct, Empty>)
             {
                 static_for<0, NRepeat, 1>{}([&](auto n0) {
-                    b_thread_copy_.Run(
-                        b_block_desc_k0_n0_n1_n2_k1,
-                        make_tuple(Number<k0 * KPack / B_K1 / B_KRow>{}, n0, I0, I0, I0, I0),
-                        b_block_buf,
-                        b_thread_desc_,
-                        make_tuple(I0, n0, k0, I0, I0, I0),
-                        b_thread_buf);
+                    b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1,
+                                       make_tuple(I0, n0, k0, I0, I0, I0, I0),
+                                       b_block_buf,
+                                       b_thread_desc_,
+                                       make_tuple(I0, n0, k0, I0, I0, I0, I0),
+                                       b_thread_buf);
                 });
             }
             else
             {
                 static_for<0, NRepeat, 1>{}([&](auto n0) {
-                    b_thread_copy_.Run(
-                        b_block_desc_k0_n0_n1_n2_k1,
-                        make_tuple(Number<k0 * KPack / B_K1 / B_KRow>{}, n0, I0, I0, I0, I0),
-                        b_block_buf,
-                        b_scale_struct.b_scale_thread_bufs(
-                            I0)[Number<n0 * BScaleStruct::num_scale_k_block +
-                                       k0 / BScaleStruct::num_scale_krepeat>{}],
-                        b_thread_desc_,
-                        make_tuple(I0, n0, k0, I0, I0, I0),
-                        b_thread_buf);
+                    b_thread_copy_.Run(b_block_desc_k0_n0_n1_n2_k1,
+                                       make_tuple(I0, n0, k0, I0, I0, I0, I0),
+                                       b_block_buf,
+                                       b_scale_struct.b_scale_thread_bufs(
+                                           I0)[Number<n0 * BScaleStruct::num_scale_k_block +
+                                                      k0 / BScaleStruct::num_scale_krepeat>{}],
+                                       b_thread_desc_,
+                                       make_tuple(I0, n0, k0, I0, I0, I0, I0),
+                                       b_thread_buf);
                 });
             }
         });
@@ -364,6 +366,9 @@ struct BlockwiseGemmWmmaops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
                         index_t num_loop_per_scale) const
     {
         __builtin_amdgcn_sched_barrier(0);
+
+        constexpr index_t KPerWaveBlock = wmma_gemm.GetKPerWaveBlk();
+
         auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeTypeA>(
             a_thread_desc_.GetElementSpaceSize());
         auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeTypeB>(
@@ -424,41 +429,48 @@ struct BlockwiseGemmWmmaops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
                 static_for<0, KRepeat, 1>{}([&](auto k0) {
                     static_for<0, MRepeat, 1>{}([&](auto m0) {
                         static_for<0, NRepeat, 1>{}([&](auto n0) {
-                            vector_type<ComputeTypeA, KPack / A_KRow> a_thread_vec;
-                            vector_type<ComputeTypeB, KPack / B_KRow> b_thread_vec;
+                            static_for<0, KInner, 1>{}([&](auto k_inner) {
+                                vector_type<ComputeTypeA, KPack / A_KRow / KInner> a_thread_vec;
+                                vector_type<ComputeTypeB, KPack / B_KRow / KInner> b_thread_vec;
 
-                            static_for<0, KPack / A_KRow, 1>{}([&](auto ik) {
-                                a_thread_vec.template AsType<ComputeTypeA>()(ik) =
-                                    a_thread_buf[Number<a_thread_desc_.CalculateOffset(
-                                        make_tuple(Number<ik / A_K1>{},
-                                                   m0,
-                                                   k0,
-                                                   I0,
-                                                   I0,
-                                                   Number<ik % A_K1>{}))>{}];
+                                static_for<0, KPack / A_KRow / KInner, 1>{}([&](auto ik) {
+                                    constexpr index_t kk = ik + k_inner * KPerWaveBlock;
+                                    a_thread_vec.template AsType<ComputeTypeA>()(ik) =
+                                        a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                            make_tuple(Number<kk / A_K1>{},
+                                                       m0,
+                                                       k0,
+                                                       I0,
+                                                       I0,
+                                                       I0,
+                                                       Number<kk % A_K1>{}))>{}];
+                                });
+                                static_for<0, KPack / B_KRow / KInner, 1>{}([&](auto ik) {
+                                    constexpr index_t kk = ik + k_inner * KPerWaveBlock;
+                                    b_thread_vec.template AsType<ComputeTypeB>()(ik) =
+                                        b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                            make_tuple(Number<kk / B_K1>{},
+                                                       n0,
+                                                       k0,
+                                                       I0,
+                                                       I0,
+                                                       I0,
+                                                       Number<kk % B_K1>{}))>{}];
+                                });
+
+                                using wmma_input_type_a =
+                                    typename vector_type<ComputeTypeA, WmmaK / A_KRow>::type;
+                                using wmma_input_type_b =
+                                    typename vector_type<ComputeTypeB, WmmaK / B_KRow>::type;
+
+                                constexpr index_t c_offset =
+                                    c_thread_desc_.CalculateOffset(make_tuple(m0, n0, I0));
+
+                                wmma_gemm.Run(
+                                    a_thread_vec.template AsType<wmma_input_type_a>(),
+                                    b_thread_vec.template AsType<wmma_input_type_b>(),
+                                    c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                             });
-                            static_for<0, KPack / B_KRow, 1>{}([&](auto ik) {
-                                b_thread_vec.template AsType<ComputeTypeB>()(ik) =
-                                    b_thread_buf[Number<b_thread_desc_.CalculateOffset(
-                                        make_tuple(Number<ik / B_K1>{},
-                                                   n0,
-                                                   k0,
-                                                   I0,
-                                                   I0,
-                                                   Number<ik % B_K1>{}))>{}];
-                            });
-
-                            using wmma_input_type_a =
-                                typename vector_type<ComputeTypeA, WmmaK / A_KRow>::type;
-                            using wmma_input_type_b =
-                                typename vector_type<ComputeTypeB, WmmaK / B_KRow>::type;
-
-                            constexpr index_t c_offset =
-                                c_thread_desc_.CalculateOffset(make_tuple(m0, n0, I0));
-
-                            wmma_gemm.Run(a_thread_vec.template AsType<wmma_input_type_a>(),
-                                          b_thread_vec.template AsType<wmma_input_type_b>(),
-                                          c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                         });
                     });
                 });
@@ -489,31 +501,47 @@ struct BlockwiseGemmWmmaops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
             static_for<0, KRepeat, 1>{}([&](auto k0) {
                 static_for<0, MRepeat, 1>{}([&](auto m0) {
                     static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        vector_type<ComputeTypeA, KPack / A_KRow> a_thread_vec;
-                        vector_type<ComputeTypeB, KPack / B_KRow> b_thread_vec;
+                        static_for<0, KInner, 1>{}([&](auto k_inner) {
+                            vector_type<ComputeTypeA, KPack / A_KRow / KInner> a_thread_vec;
+                            vector_type<ComputeTypeB, KPack / B_KRow / KInner> b_thread_vec;
 
-                        static_for<0, KPack / A_KRow, 1>{}([&](auto ik) {
-                            a_thread_vec.template AsType<ComputeTypeA>()(ik) =
-                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(make_tuple(
-                                    Number<ik / A_K1>{}, m0, k0, I0, I0, Number<ik % A_K1>{}))>{}];
+                            static_for<0, KPack / A_KRow / KInner, 1>{}([&](auto ik) {
+                                constexpr index_t kk = ik + k_inner * KPerWaveBlock;
+                                a_thread_vec.template AsType<ComputeTypeA>()(ik) =
+                                    a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                        make_tuple(Number<kk / A_K1>{},
+                                                   m0,
+                                                   k0,
+                                                   I0,
+                                                   I0,
+                                                   I0,
+                                                   Number<kk % A_K1>{}))>{}];
+                            });
+                            static_for<0, KPack / B_KRow / KInner, 1>{}([&](auto ik) {
+                                constexpr index_t kk = ik + k_inner * KPerWaveBlock;
+                                b_thread_vec.template AsType<ComputeTypeB>()(ik) =
+                                    b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                        make_tuple(Number<kk / B_K1>{},
+                                                   n0,
+                                                   k0,
+                                                   I0,
+                                                   I0,
+                                                   I0,
+                                                   Number<kk % B_K1>{}))>{}];
+                            });
+
+                            using wmma_input_type_a =
+                                typename vector_type<ComputeTypeA, WmmaK / A_KRow>::type;
+                            using wmma_input_type_b =
+                                typename vector_type<ComputeTypeB, WmmaK / B_KRow>::type;
+
+                            constexpr index_t c_offset =
+                                c_thread_desc_.CalculateOffset(make_tuple(m0, n0, I0));
+
+                            wmma_gemm.Run(a_thread_vec.template AsType<wmma_input_type_a>(),
+                                          b_thread_vec.template AsType<wmma_input_type_b>(),
+                                          c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                         });
-                        static_for<0, KPack / B_KRow, 1>{}([&](auto ik) {
-                            b_thread_vec.template AsType<ComputeTypeB>()(ik) =
-                                b_thread_buf[Number<b_thread_desc_.CalculateOffset(make_tuple(
-                                    Number<ik / B_K1>{}, n0, k0, I0, I0, Number<ik % B_K1>{}))>{}];
-                        });
-
-                        using wmma_input_type_a =
-                            typename vector_type<ComputeTypeA, WmmaK / A_KRow>::type;
-                        using wmma_input_type_b =
-                            typename vector_type<ComputeTypeB, WmmaK / B_KRow>::type;
-
-                        constexpr index_t c_offset =
-                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, I0));
-
-                        wmma_gemm.Run(a_thread_vec.template AsType<wmma_input_type_a>(),
-                                      b_thread_vec.template AsType<wmma_input_type_b>(),
-                                      c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                     });
                 });
             });
@@ -531,31 +559,47 @@ struct BlockwiseGemmWmmaops_pipeline_v3<BlockGemmPipelineScheduler::Intrawave,
             static_for<0, KRepeat, 1>{}([&](auto k0) {
                 static_for<0, MRepeat, 1>{}([&](auto m0) {
                     static_for<0, NRepeat, 1>{}([&](auto n0) {
-                        vector_type<ComputeTypeA, KPack / A_KRow> a_thread_vec;
-                        vector_type<ComputeTypeB, KPack / B_KRow> b_thread_vec;
+                        static_for<0, KInner, 1>{}([&](auto k_inner) {
+                            vector_type<ComputeTypeA, KPack / A_KRow / KInner> a_thread_vec;
+                            vector_type<ComputeTypeB, KPack / B_KRow / KInner> b_thread_vec;
 
-                        static_for<0, KPack / A_KRow, 1>{}([&](auto ik) {
-                            a_thread_vec.template AsType<ComputeTypeA>()(ik) =
-                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(make_tuple(
-                                    Number<ik / A_K1>{}, m0, k0, I0, I0, Number<ik % A_K1>{}))>{}];
+                            static_for<0, KPack / A_KRow / KInner, 1>{}([&](auto ik) {
+                                constexpr index_t kk = ik + k_inner * KPerWaveBlock;
+                                a_thread_vec.template AsType<ComputeTypeA>()(ik) =
+                                    a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                        make_tuple(Number<kk / A_K1>{},
+                                                   m0,
+                                                   k0,
+                                                   I0,
+                                                   I0,
+                                                   I0,
+                                                   Number<kk % A_K1>{}))>{}];
+                            });
+                            static_for<0, KPack / B_KRow / KInner, 1>{}([&](auto ik) {
+                                constexpr index_t kk = ik + k_inner * KPerWaveBlock;
+                                b_thread_vec.template AsType<ComputeTypeB>()(ik) =
+                                    b_thread_buf[Number<b_thread_desc_.CalculateOffset(
+                                        make_tuple(Number<kk / B_K1>{},
+                                                   n0,
+                                                   k0,
+                                                   I0,
+                                                   I0,
+                                                   I0,
+                                                   Number<kk % B_K1>{}))>{}];
+                            });
+
+                            using wmma_input_type_a =
+                                typename vector_type<ComputeTypeA, WmmaK / A_KRow>::type;
+                            using wmma_input_type_b =
+                                typename vector_type<ComputeTypeB, WmmaK / B_KRow>::type;
+
+                            constexpr index_t c_offset =
+                                c_thread_desc_.CalculateOffset(make_tuple(m0, n0, I0));
+
+                            wmma_gemm.Run(a_thread_vec.template AsType<wmma_input_type_a>(),
+                                          b_thread_vec.template AsType<wmma_input_type_b>(),
+                                          c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                         });
-                        static_for<0, KPack / B_KRow, 1>{}([&](auto ik) {
-                            b_thread_vec.template AsType<ComputeTypeB>()(ik) =
-                                b_thread_buf[Number<b_thread_desc_.CalculateOffset(make_tuple(
-                                    Number<ik / B_K1>{}, n0, k0, I0, I0, Number<ik % B_K1>{}))>{}];
-                        });
-
-                        using wmma_input_type_a =
-                            typename vector_type<ComputeTypeA, WmmaK / A_KRow>::type;
-                        using wmma_input_type_b =
-                            typename vector_type<ComputeTypeB, WmmaK / B_KRow>::type;
-
-                        constexpr index_t c_offset =
-                            c_thread_desc_.CalculateOffset(make_tuple(m0, n0, I0));
-
-                        wmma_gemm.Run(a_thread_vec.template AsType<wmma_input_type_a>(),
-                                      b_thread_vec.template AsType<wmma_input_type_b>(),
-                                      c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
                     });
                 });
             });
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_gufusion_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_gufusion_v3.hpp
index b3b3d312c7..b621c3a93d 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_gufusion_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_gufusion_v3.hpp
@@ -727,7 +727,8 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_gufusion_v3<
                         });
                     });
 
-                    HotLoopScheduler();
+                    if constexpr(MPerBlock >= 64)
+                        HotLoopScheduler();
                     __builtin_amdgcn_sched_barrier(0);
                 };
 
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_selector.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_selector.hpp
index 6789d26a45..5223993671 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_selector.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_selector.hpp
@@ -4,6 +4,7 @@
 #pragma once
 
 #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp"
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v1.hpp"
 #include "ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_gufusion_v3.hpp"
 
 namespace ck {
@@ -45,7 +46,28 @@ constexpr auto BlockGemmMXBPreshufflePipeline_Selector()
         }
         else
         {
-            return nullptr;
+            return BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v1<
+                BlkGemmPipeSche,
+                ThreadBlockSize,
+                ScaleBlockSize,
+                ADataType,
+                AScaleDataType,
+                BDataType,
+                BScaleDataType,
+                ATileDesc,
+                BTileDesc,
+                AMmaTileDesc,
+                BMmaTileDesc,
+                ABlockTransferSrcScalarPerVector,
+                BBlockTransferSrcScalarPerVector,
+                MPerBlock,
+                NPerBlock,
+                KPerBlock,
+                MPerXDL,
+                NPerXDL,
+                MRepeat,
+                NRepeat,
+                KPack>{};
         }
     }
     else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v1.hpp
new file mode 100644
index 0000000000..fc5cb60c37
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v1.hpp
@@ -0,0 +1,891 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/tensor_operation/gpu/block/blockwise_gemm_mx_pipeline_xdlops_base.hpp"
+
+namespace ck {
+
+// Naive pipeline with lowest resource request per WGP
+// GlobalPrefetchStages: 2
+// LocalPreFillStages: 1
+// LocalPreFetchStages: 1
+// LocalSharedMemoryBuffer: 1
+
+template <BlockGemmPipelineScheduler BlkGemmPipelineVer, // scheduler kind, despite the name
+          index_t ThreadBlockSize,
+          index_t ScaleBlockSize,
+          typename ADataType,
+          typename AScaleDataType,
+          typename BDataType,
+          typename BScaleDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat, // MXdlPerWave
+          index_t NRepeat, // NXdlPerWave
+          index_t KPack>
+struct BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v1
+{ // Primary template is empty; see the BlockGemmPipelineScheduler::Intrawave specialization.
+};
+
+template <index_t ThreadBlockSize,
+          index_t ScaleBlockSize,
+          typename ADataType,
+          typename AScaleDataType,
+          typename BDataType,
+          typename BScaleDataType,
+          typename ATileDesc,
+          typename BTileDesc,
+          typename AMmaTileDesc,
+          typename BMmaTileDesc,
+          index_t ABlockTransferSrcScalarPerVector,
+          index_t BBlockTransferSrcScalarPerVector,
+          index_t MPerBlock,
+          index_t NPerBlock,
+          index_t KPerBlock,
+          index_t MPerXDL,
+          index_t NPerXDL,
+          index_t MRepeat, // MXdlPerWave
+          index_t NRepeat, // NXdlPerWave
+          index_t KPack>
+struct BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v1<BlockGemmPipelineScheduler::Intrawave,
+                                                          ThreadBlockSize,
+                                                          ScaleBlockSize,
+                                                          ADataType,
+                                                          AScaleDataType,
+                                                          BDataType,
+                                                          BScaleDataType,
+                                                          ATileDesc,
+                                                          BTileDesc,
+                                                          AMmaTileDesc,
+                                                          BMmaTileDesc,
+                                                          ABlockTransferSrcScalarPerVector,
+                                                          BBlockTransferSrcScalarPerVector,
+                                                          MPerBlock,
+                                                          NPerBlock,
+                                                          KPerBlock,
+                                                          MPerXDL,
+                                                          NPerXDL,
+                                                          MRepeat,
+                                                          NRepeat,
+                                                          KPack>
+    : BlockwiseGemmXdlops_mx_pipeline_base<ThreadBlockSize,
+                                           ADataType,
+                                           BDataType,
+                                           ATileDesc,
+                                           BTileDesc,
+                                           AMmaTileDesc,
+                                           BMmaTileDesc,
+                                           ABlockTransferSrcScalarPerVector,
+                                           BBlockTransferSrcScalarPerVector,
+                                           MPerBlock,
+                                           NPerBlock,
+                                           KPerBlock,
+                                           MPerXDL,
+                                           NPerXDL,
+                                           MRepeat,
+                                           NRepeat,
+                                           KPack>
+
+{
+
+    using Base = BlockwiseGemmXdlops_mx_pipeline_base<ThreadBlockSize,
+                                                      ADataType,
+                                                      BDataType,
+                                                      ATileDesc,
+                                                      BTileDesc,
+                                                      AMmaTileDesc,
+                                                      BMmaTileDesc,
+                                                      ABlockTransferSrcScalarPerVector,
+                                                      BBlockTransferSrcScalarPerVector,
+                                                      MPerBlock,
+                                                      NPerBlock,
+                                                      KPerBlock,
+                                                      MPerXDL,
+                                                      NPerXDL,
+                                                      MRepeat,
+                                                      NRepeat,
+                                                      KPack>; // same arguments as the base-class list
+    using Base::A_K1;
+    using Base::I0;
+    using Base::I1;
+    using Base::KRepeat;
+    using Base::MWaves;
+    using Base::NWaves;
+    using Base::WaveSize;
+    using Base::xdlops_gemm;
+    using typename Base::HotLoopInstList;
+    // Base helpers for C descriptors, the C thread buffer and the wave index.
+    using Base::CalculateCThreadOriginDataIndex;
+    using Base::GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::GetCThreadBuffer;
+    using Base::GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4;
+    using Base::GetWaveIdx;
+    using Base::MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2;
+    using Base::MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2;
+    // Block-level A/B tile descriptors inherited from Base.
+    using Base::a_block_desc_m0_m1_m2_m3_k;
+    using Base::b_block_desc_n0_n1_n2_n3_k;
+    // K-stride, packed-size and K-chunk constants inherited from Base.
+    using Base::AMmaKStride;
+    using Base::APackedSize;
+    using Base::BMmaKStride;
+    using Base::BPackedSize;
+    using Base::KThreadChunk;
+    // Xdl pack factors along K, M and N; they divide the repeat counts used below.
+    using Base::KXdlPack;
+    using Base::MXdlPack;
+    using Base::NXdlPack;
+    // Accumulator, index-tuple and compute element types inherited from Base.
+    using AccType      = typename Base::AccType;
+    using Tuple5       = typename Base::Tuple5;
+    using ComputeTypeA = typename Base::ComputeTypeA;
+    using ComputeTypeB = typename Base::ComputeTypeB;
+
+    static constexpr index_t PrefetchStages        = 2; // BlockHasHotloop() threshold
+    static constexpr index_t PrefillStages         = 1;
+    static constexpr index_t GlobalBufferNum       = 2;
+    static constexpr index_t HotloopLocalBufSwitch = MRepeat % 2 == 0 ? 0 : 1;
+    // Number of A/B scale-copy Run() calls per K iteration (see the prefetch loops in Run()).
+    static constexpr auto num_buffer_load_a_scale = MRepeat / MXdlPack * KRepeat / KXdlPack;
+    static constexpr auto num_buffer_load_b_scale = NRepeat / NXdlPack * KRepeat / KXdlPack;
+    static constexpr auto async_vmcnt =
+        num_buffer_load_a_scale + num_buffer_load_b_scale + HotLoopInstList::B_Buffer_Load_Inst_Num;
+    static constexpr auto async_vmcnt_encoding = 3952 + async_vmcnt % 16 + async_vmcnt / 16 * 16384;
+    // ^ s_waitcnt operand in Run(). NOTE(review): 3952 == 0xF70; count = low 4 bits + bits 14 up.
+    static constexpr auto ScalesPerKBlockSize =
+        KPerBlock / ScaleBlockSize; // How many mx-vectors per K block
+
+    //> How many mx-vectors in each row/col are processed in one call to xdlops_gemm.Run()
+    static constexpr auto ScalesPerXdlopsRun =
+        (APackedSize * KPack * xdlops_gemm.K0PerXdlops) / ScaleBlockSize;
+
+    //> How many scales a thread must read to accommodate one call to xdlops_gemm.Run()
+    static constexpr auto ScalesPerXdlopsRunPerThread =
+        ScalesPerXdlopsRun / xdlops_gemm.mfma_instr.num_input_blks;
+    // A/B scale types may hold several e8m0 scales; derive the per-thread scale vector sizes.
+    using mx_scale_t                        = e8m0_bexp_t;
+    static constexpr auto scale_pack_size_a = sizeof(AScaleDataType) / sizeof(mx_scale_t);
+    static constexpr auto scale_pack_size_b = sizeof(BScaleDataType) / sizeof(mx_scale_t);
+    static_assert(KXdlPack * MXdlPack % scale_pack_size_a == 0,
+                  "A scale pack data type too large!");
+    static_assert(KXdlPack * NXdlPack % scale_pack_size_b == 0,
+                  "B scale pack data type too large!");
+    static constexpr auto a_scale_thread_vec_size = KXdlPack * MXdlPack / scale_pack_size_a;
+    static constexpr auto b_scale_thread_vec_size = KXdlPack * NXdlPack / scale_pack_size_b;
+
+    __host__ static constexpr bool BlockHasHotloop(index_t num_loop)
+    {
+        return PrefetchStages < num_loop; // true when iterations exceed the prefetch depth
+    }
+
+    __host__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
+    {
+        return (num_loop % 2 != 0) ? TailNumber::Odd : TailNumber::Even; // parity picks the tail
+    }
+
+    __device__ static constexpr auto HotLoopScheduler() // scheduling hints only; moves no data
+    {
+        constexpr auto num_ds_read_inst_a     = HotLoopInstList::A_LDS_Read_Inst_Num;
+        constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num;
+        constexpr auto num_buffer_load_inst_b = HotLoopInstList::B_Buffer_Load_Inst_Num * MWaves +
+                                                num_buffer_load_a_scale + num_buffer_load_b_scale;
+        constexpr auto mfma_interleave = MPerXDL == 32 ? 1 : 2;
+        // B global loads, counted together with the A/B scale loads: MFMA group, then one VMEM read
+        static_for<0, num_buffer_load_inst_b, 1>{}([&](auto i) {
+            ignore = i;
+            if constexpr(MPerBlock >= 128 && NPerBlock >= 128)
+            {
+                __builtin_amdgcn_sched_group_barrier(0x008, 2 * mfma_interleave, 0); // MFMA
+            }
+            else
+            {
+                __builtin_amdgcn_sched_group_barrier(0x008, mfma_interleave, 0); // MFMA
+            }
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+        });
+
+        // A global loads: MFMA, DS write, MFMA, VMEM read for each A buffer-load instruction
+        static_for<0, num_buffer_load_inst_a, 1>{}([&](auto i) {
+            ignore = i;
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
+            __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+        });
+
+        // A LDS reads: one MFMA per DS-read group; MPerXDL == 32 pairs two reads per group
+        static_for<0, MPerXDL == 32 ? num_ds_read_inst_a / 2 : num_ds_read_inst_a, 1>{}(
+            [&](auto i) {
+                ignore = i;
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0);                     // MFMA
+                __builtin_amdgcn_sched_group_barrier(0x100, MPerXDL == 32 ? 2 : 1, 0); // DS read
+            });
+    }
+
+    template <bool HasMainLoop,
+              TailNumber TailNum,
+              typename AGridDesc,
+              typename ABlockDesc,
+              typename ABlockTransfer,
+              typename AGridBuffer,
+              typename ABlockBuffer,
+              typename ABlockTransferStep,
+              typename BGridDesc,
+              typename BBlockDesc,
+              typename BBlockTransfer,
+              typename BGridBuffer,
+              typename BBlockBuffer,
+              typename BBlockTransferStep,
+              typename CThreadBuffer,
+              typename AScaleGridBuffer,
+              typename AScaleGridDesc,
+              typename AScaleThreadTransfer,
+              typename BScaleGridBuffer,
+              typename BScaleGridDesc,
+              typename BScaleThreadTransfer>
+    __device__ void Run(
+        // ABlockCopy
+        const AGridDesc& a_grid_desc,
+        const ABlockDesc& a_block_desc,
+        ABlockTransfer& a_blockwise_copy,
+        const AGridBuffer& a_grid_buf,
+        ABlockBuffer& a_block_buf,
+        const ABlockTransferStep& a_block_copy_step,
+        // BBlockCopy
+        const BGridDesc& b_grid_desc,
+        const BBlockDesc& b_block_desc,
+        BBlockTransfer& b_blockwise_copy,
+        const BGridBuffer& b_grid_buf,
+        BBlockBuffer& b_block_bufs,
+        const BBlockTransferStep& b_block_copy_step,
+        // CThread
+        CThreadBuffer& c_thread_buf,
+        // A and B scales
+        const AScaleGridDesc& a_scale_grid_desc,
+        AScaleThreadTransfer& a_scale_thread_copy,
+        const AScaleGridBuffer& a_scale_grid_buf,
+        const BScaleGridDesc& b_scale_grid_desc,
+        BScaleThreadTransfer& b_scale_thread_copy,
+        const BScaleGridBuffer& b_scale_grid_buf,
+        index_t num_loop) const
+    {
+        ignore = b_block_bufs;
+        __builtin_amdgcn_sched_barrier(0);
+        auto a_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeTypeA>(
+            a_thread_desc_.GetElementSpaceSize());
+        auto b_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, ComputeTypeB>(
+            b_thread_desc_.GetElementSpaceSize());
+
+        StaticallyIndexedArray<decltype(b_thread_buf), Number<2>{}> b_thread_bufs;
+        constexpr auto b_block_origin_idx = make_tuple(I0, I0, I0, I0, I0);
+
+        auto a_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, AScaleDataType>(
+            a_scale_thread_desc.GetElementSpaceSize());
+        auto b_scale_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, BScaleDataType>(
+            b_scale_thread_desc.GetElementSpaceSize());
+
+        StaticallyIndexedArray<decltype(a_scale_thread_buf), Number<2>{}> a_scale_thread_bufs;
+        StaticallyIndexedArray<decltype(b_scale_thread_buf), Number<2>{}> b_scale_thread_bufs;
+
+        // Global prefetch 1
+        a_blockwise_copy.Run(a_grid_desc, a_grid_buf, a_block_desc, a_block_buf);
+        b_blockwise_copy.Run(
+            b_grid_desc, b_grid_buf, b_block_desc, b_block_origin_idx, b_thread_bufs(I0));
+
+        a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+        b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+        __builtin_amdgcn_sched_barrier(0);
+
+        // Prefetch a_scales
+        static_for<0, MRepeat / MXdlPack, 1>{}([&](auto m0) {
+            static_for<0, KRepeat / KXdlPack, 1>{}([&](auto k0) {
+                a_scale_thread_copy.Run(a_scale_grid_desc,
+                                        a_scale_grid_buf,
+                                        a_scale_thread_desc,
+                                        make_tuple(m0, k0, I0),
+                                        a_scale_thread_bufs(I0));
+
+                a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                                       make_multi_index(0, I1, 0));
+            });
+            a_scale_thread_copy.MoveSrcSliceWindow(
+                a_scale_grid_desc, make_multi_index(MWaves, -KRepeat / KXdlPack, 0));
+        });
+
+        // restore row id and advance to the next set of scales
+        a_scale_thread_copy.MoveSrcSliceWindow(
+            a_scale_grid_desc,
+            make_multi_index(-MWaves * MRepeat / MXdlPack, KRepeat / KXdlPack, 0));
+
+        // Prefetch b_scales
+        static_for<0, NRepeat / NXdlPack, 1>{}([&](auto n0) {
+            static_for<0, KRepeat / KXdlPack, 1>{}([&](auto k0) {
+                b_scale_thread_copy.Run(b_scale_grid_desc,
+                                        b_scale_grid_buf,
+                                        b_scale_thread_desc,
+                                        make_tuple(n0, k0, I0),
+                                        b_scale_thread_bufs(I0));
+
+                b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
+                                                       make_multi_index(0, I1, 0));
+            });
+            b_scale_thread_copy.MoveSrcSliceWindow(
+                b_scale_grid_desc, make_multi_index(NWaves, -KRepeat / KXdlPack, 0));
+        });
+
+        // restore col id and advance to the next set of scales
+        // NWaves * NPerXDL * NRepeat == NPerBlock
+        b_scale_thread_copy.MoveSrcSliceWindow(
+            b_scale_grid_desc,
+            make_multi_index(-NWaves * NRepeat / NXdlPack, KRepeat / KXdlPack, 0));
+
+        // Local prefetch 1, sync the async load
+        __builtin_amdgcn_s_waitcnt(async_vmcnt_encoding);
+        block_sync_lds();
+        static_for<0, MRepeat, 1>{}([&](auto m0) {
+            static_for<0, KRepeat, 1>{}([&](auto k) {
+                constexpr auto k_step = k * xdlops_gemm.KPerXdlops / APackedSize *
+                                        (APackedSize * KPack / xdlops_gemm.K1PerXdlops);
+                static_for<0, xdlops_gemm.K1PerXdlops / (APackedSize * KThreadChunk), 1>{}(
+                    [&](auto chunk) {
+                        constexpr auto a_k_step_chunk =
+                            k_step + chunk * KThreadChunk * xdlops_gemm.mfma_instr.num_input_blks;
+                        a_thread_copy_.Run(a_block_desc_m0_m1_m2_m3_k,
+                                           make_tuple(Number<m0 / MXdlPack>{},
+                                                      I0,
+                                                      Number<m0 % MXdlPack>{},
+                                                      I0,
+                                                      Number<a_k_step_chunk>{}),
+                                           a_block_buf,
+                                           a_thread_desc_,
+                                           make_tuple(Number<m0 / MXdlPack>{},
+                                                      I0,
+                                                      Number<m0 % MXdlPack>{},
+                                                      k,
+                                                      Number<chunk * KThreadChunk>{}),
+                                           a_thread_buf);
+                    });
+            });
+        });
+
+        // Initialize C
+        c_thread_buf.Clear();
+        __builtin_amdgcn_sched_barrier(0);
+        // main body
+        if constexpr(HasMainLoop)
+        {
+            // loop over k with the step KPerBlock
+            index_t i = 0;
+            do
+            {
+                auto LoopFunc = [&](auto scale_comp_buf, auto scale_mem_buf) {
+                    b_blockwise_copy.Run(b_grid_desc,
+                                         b_grid_buf,
+                                         b_block_desc,
+                                         b_block_origin_idx,
+                                         b_thread_bufs(scale_mem_buf));
+
+                    block_sync_lds();
+                    a_blockwise_copy.Run(a_grid_desc, a_grid_buf, a_block_desc, a_block_buf);
+                    // Prefetch a_scales
+                    static_for<0, MRepeat / MXdlPack, 1>{}([&](auto m0) {
+                        static_for<0, KRepeat / KXdlPack, 1>{}([&](auto k0) {
+                            a_scale_thread_copy.Run(a_scale_grid_desc,
+                                                    a_scale_grid_buf,
+                                                    a_scale_thread_desc,
+                                                    make_tuple(m0, k0, I0),
+                                                    a_scale_thread_bufs(scale_mem_buf));
+
+                            a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                                                   make_multi_index(0, I1, 0));
+                        });
+                        a_scale_thread_copy.MoveSrcSliceWindow(
+                            a_scale_grid_desc, make_multi_index(MWaves, -KRepeat / KXdlPack, 0));
+                    });
+
+                    // restore row id and advance to the next set of scales
+                    a_scale_thread_copy.MoveSrcSliceWindow(
+                        a_scale_grid_desc,
+                        make_multi_index(-MWaves * MRepeat / MXdlPack, KRepeat / KXdlPack, 0));
+
+                    // Prefetch b_scales
+                    static_for<0, NRepeat / NXdlPack, 1>{}([&](auto n0) {
+                        static_for<0, KRepeat / KXdlPack, 1>{}([&](auto k0) {
+                            b_scale_thread_copy.Run(b_scale_grid_desc,
+                                                    b_scale_grid_buf,
+                                                    b_scale_thread_desc,
+                                                    make_tuple(n0, k0, I0),
+                                                    b_scale_thread_bufs(scale_mem_buf));
+
+                            b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
+                                                                   make_multi_index(0, I1, 0));
+                        });
+                        b_scale_thread_copy.MoveSrcSliceWindow(
+                            b_scale_grid_desc, make_multi_index(NWaves, -KRepeat / KXdlPack, 0));
+                    });
+
+                    // restore col id and advance to the next set of scales
+                    // NWaves * NPerXDL * NRepeat == NPerBlock
+                    b_scale_thread_copy.MoveSrcSliceWindow(
+                        b_scale_grid_desc,
+                        make_multi_index(-NWaves * NRepeat / NXdlPack, KRepeat / KXdlPack, 0));
+
+                    a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
+                    b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
+
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        constexpr auto im_major = m0 / MXdlPack;
+                        constexpr auto im_minor = m0 % MXdlPack;
+                        static_for<0, KRepeat, 1>{}([&](auto k0) {
+                            constexpr auto ik_major = k0 / KXdlPack;
+                            constexpr auto ik_minor = k0 % KXdlPack;
+                            static_for<0, NRepeat, 1>{}([&](auto n0) {
+                                constexpr auto in_major = n0 / NXdlPack;
+                                constexpr auto in_minor = n0 % NXdlPack;
+
+                                constexpr index_t a_scale_offset =
+                                    a_scale_thread_desc.CalculateOffset(
+                                        make_tuple(im_major, ik_major, I0));
+                                constexpr index_t b_scale_offset =
+                                    b_scale_thread_desc.CalculateOffset(
+                                        make_tuple(in_major, ik_major, I0));
+
+                                static_assert(0 < ScalesPerXdlopsRunPerThread,
+                                              "Must have at least one scale per Xdlops "
+                                              "per Thread.");
+
+                                vector_type<AScaleDataType, a_scale_thread_vec_size>
+                                    a_scale_thread_vec;
+                                vector_type<BScaleDataType, b_scale_thread_vec_size>
+                                    b_scale_thread_vec;
+
+                                // Pack scale_thread_buf into scale_thread_vec
+                                static_for<0, a_scale_thread_vec_size, 1>{}([&](auto s) {
+                                    a_scale_thread_vec.template AsType<AScaleDataType>()(s) =
+                                        a_scale_thread_bufs(
+                                            scale_comp_buf)[Number<a_scale_offset + s>{}];
+                                });
+
+                                static_for<0, b_scale_thread_vec_size, 1>{}([&](auto s) {
+                                    b_scale_thread_vec.template AsType<BScaleDataType>()(s) =
+                                        b_scale_thread_bufs(
+                                            scale_comp_buf)[Number<b_scale_offset + s>{}];
+                                });
+
+                                vector_type<ComputeTypeA, KPack> a_thread_vec;
+                                vector_type<ComputeTypeB, KPack> b_thread_vec;
+
+                                static_for<0, KPack, 1>{}([&](auto ik) {
+                                    a_thread_vec.template AsType<ComputeTypeA>()(ik) =
+                                        a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                            make_tuple(im_major, I0, im_minor, k0, ik))>{}];
+                                    b_thread_vec.template AsType<ComputeTypeB>()(ik) = b_thread_bufs
+                                        [scale_comp_buf][Number<b_thread_desc_.CalculateOffset(
+                                            make_tuple(in_major, I0, in_minor, k0, ik))>{}];
+                                });
+
+                                using mfma_input_type_a =
+                                    typename vector_type<ComputeTypeA,
+                                                         xdlops_gemm.K1PerXdlops /
+                                                             APackedSize>::type;
+
+                                using mfma_input_type_b =
+                                    typename vector_type<ComputeTypeB,
+                                                         xdlops_gemm.K1PerXdlops /
+                                                             BPackedSize>::type;
+
+                                using mfma_scale_input_type_a =
+                                    typename vector_type<AScaleDataType,
+                                                         a_scale_thread_vec_size>::type;
+                                using mfma_scale_input_type_b =
+                                    typename vector_type<BScaleDataType,
+                                                         b_scale_thread_vec_size>::type;
+
+                                constexpr index_t c_offset = c_thread_desc_.CalculateOffset(
+                                    make_tuple(im_major, in_major, im_minor, in_minor, 0));
+
+                                // MFMA accumulation
+                                xdlops_gemm.template Run<ik_minor * MXdlPack + im_minor,
+                                                         ik_minor * NXdlPack + in_minor>(
+                                    a_thread_vec.template AsType<mfma_input_type_a>(),
+                                    a_scale_thread_vec.template AsType<mfma_scale_input_type_a>(),
+                                    b_thread_vec.template AsType<mfma_input_type_b>(),
+                                    b_scale_thread_vec.template AsType<mfma_scale_input_type_b>(),
+                                    c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                            });
+                        });
+                    });
+
+                    block_sync_lds();
+
+                    static_for<0, MRepeat, 1>{}([&](auto m0) {
+                        static_for<0, KRepeat, 1>{}([&](auto k) {
+                            constexpr auto k_step = k * xdlops_gemm.KPerXdlops / APackedSize *
+                                                    (APackedSize * KPack / xdlops_gemm.K1PerXdlops);
+                            static_for<0,
+                                       xdlops_gemm.K1PerXdlops / (APackedSize * KThreadChunk),
+                                       1>{}([&](auto chunk) {
+                                constexpr auto a_k_step_chunk =
+                                    k_step +
+                                    chunk * KThreadChunk * xdlops_gemm.mfma_instr.num_input_blks;
+                                a_thread_copy_.Run(a_block_desc_m0_m1_m2_m3_k,
+                                                   make_tuple(Number<m0 / MXdlPack>{},
+                                                              I0,
+                                                              Number<m0 % MXdlPack>{},
+                                                              I0,
+                                                              Number<a_k_step_chunk>{}),
+                                                   a_block_buf,
+                                                   a_thread_desc_,
+                                                   make_tuple(Number<m0 / MXdlPack>{},
+                                                              I0,
+                                                              Number<m0 % MXdlPack>{},
+                                                              k,
+                                                              Number<chunk * KThreadChunk>{}),
+                                                   a_thread_buf);
+                            });
+                        });
+                    });
+                    HotLoopScheduler();
+                    __builtin_amdgcn_sched_barrier(0);
+                };
+
+                LoopFunc(I0, I1);
+                LoopFunc(I1, I0);
+
+                i += 2;
+            } while(i < (num_loop - 2));
+        }
+
+        // tail
+        if constexpr(TailNum == TailNumber::Even)
+        {
+            b_blockwise_copy.Run(
+                b_grid_desc, b_grid_buf, b_block_desc, b_block_origin_idx, b_thread_bufs(I1));
+
+            block_sync_lds();
+            a_blockwise_copy.Run(a_grid_desc, a_grid_buf, a_block_desc, a_block_buf);
+            // Prefetch a_scales
+            static_for<0, MRepeat / MXdlPack, 1>{}([&](auto m0) {
+                static_for<0, KRepeat / KXdlPack, 1>{}([&](auto k0) {
+                    a_scale_thread_copy.Run(a_scale_grid_desc,
+                                            a_scale_grid_buf,
+                                            a_scale_thread_desc,
+                                            make_tuple(m0, k0, I0),
+                                            a_scale_thread_bufs(I1));
+
+                    a_scale_thread_copy.MoveSrcSliceWindow(a_scale_grid_desc,
+                                                           make_multi_index(0, I1, 0));
+                });
+                a_scale_thread_copy.MoveSrcSliceWindow(
+                    a_scale_grid_desc, make_multi_index(MWaves, -KRepeat / KXdlPack, 0));
+            });
+
+            // Prefetch b_scales
+            static_for<0, NRepeat / NXdlPack, 1>{}([&](auto n0) {
+                static_for<0, KRepeat / KXdlPack, 1>{}([&](auto k0) {
+                    b_scale_thread_copy.Run(b_scale_grid_desc,
+                                            b_scale_grid_buf,
+                                            b_scale_thread_desc,
+                                            make_tuple(n0, k0, I0),
+                                            b_scale_thread_bufs(I1));
+
+                    b_scale_thread_copy.MoveSrcSliceWindow(b_scale_grid_desc,
+                                                           make_multi_index(0, I1, 0));
+                });
+                b_scale_thread_copy.MoveSrcSliceWindow(
+                    b_scale_grid_desc, make_multi_index(NWaves, -KRepeat / KXdlPack, 0));
+            });
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                constexpr auto im_major = m0 / MXdlPack;
+                constexpr auto im_minor = m0 % MXdlPack;
+                static_for<0, KRepeat, 1>{}([&](auto k0) {
+                    constexpr auto ik_major = k0 / KXdlPack;
+                    constexpr auto ik_minor = k0 % KXdlPack;
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        constexpr auto in_major = n0 / NXdlPack;
+                        constexpr auto in_minor = n0 % NXdlPack;
+
+                        constexpr index_t a_scale_offset =
+                            a_scale_thread_desc.CalculateOffset(make_tuple(im_major, ik_major, I0));
+                        constexpr index_t b_scale_offset =
+                            b_scale_thread_desc.CalculateOffset(make_tuple(in_major, ik_major, I0));
+
+                        static_assert(0 < ScalesPerXdlopsRunPerThread,
+                                      "Must have at least one scale per Xdlops "
+                                      "per Thread.");
+
+                        vector_type<AScaleDataType, a_scale_thread_vec_size> a_scale_thread_vec;
+                        vector_type<BScaleDataType, b_scale_thread_vec_size> b_scale_thread_vec;
+
+                        // Pack scale_thread_buf into scale_thread_vec
+                        static_for<0, a_scale_thread_vec_size, 1>{}([&](auto s) {
+                            a_scale_thread_vec.template AsType<AScaleDataType>()(s) =
+                                a_scale_thread_bufs(I0)[Number<a_scale_offset + s>{}];
+                        });
+
+                        static_for<0, b_scale_thread_vec_size, 1>{}([&](auto s) {
+                            b_scale_thread_vec.template AsType<BScaleDataType>()(s) =
+                                b_scale_thread_bufs(I0)[Number<b_scale_offset + s>{}];
+                        });
+
+                        vector_type<ComputeTypeA, KPack> a_thread_vec;
+                        vector_type<ComputeTypeB, KPack> b_thread_vec;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeTypeA>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(im_major, I0, im_minor, k0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeTypeB>()(ik) =
+                                b_thread_bufs[I0][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(in_major, I0, in_minor, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type_a =
+                            typename vector_type<ComputeTypeA,
+                                                 xdlops_gemm.K1PerXdlops / APackedSize>::type;
+
+                        using mfma_input_type_b =
+                            typename vector_type<ComputeTypeB,
+                                                 xdlops_gemm.K1PerXdlops / BPackedSize>::type;
+
+                        using mfma_scale_input_type_a =
+                            typename vector_type<AScaleDataType, a_scale_thread_vec_size>::type;
+                        using mfma_scale_input_type_b =
+                            typename vector_type<BScaleDataType, b_scale_thread_vec_size>::type;
+
+                        constexpr index_t c_offset = c_thread_desc_.CalculateOffset(
+                            make_tuple(im_major, in_major, im_minor, in_minor, 0));
+
+                        // MFMA accumulation
+                        xdlops_gemm.template Run<ik_minor * MXdlPack + im_minor,
+                                                 ik_minor * NXdlPack + in_minor>(
+                            a_thread_vec.template AsType<mfma_input_type_a>(),
+                            a_scale_thread_vec.template AsType<mfma_scale_input_type_a>(),
+                            b_thread_vec.template AsType<mfma_input_type_b>(),
+                            b_scale_thread_vec.template AsType<mfma_scale_input_type_b>(),
+                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+
+                // constexpr auto lds_buf = m0.value >= SwitchM ? I1 : I0;
+            });
+            __builtin_amdgcn_s_waitcnt(async_vmcnt_encoding);
+            block_sync_lds();
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                static_for<0, KRepeat, 1>{}([&](auto k) {
+                    constexpr auto k_step = k * xdlops_gemm.KPerXdlops / APackedSize *
+                                            (APackedSize * KPack / xdlops_gemm.K1PerXdlops);
+                    static_for<0, xdlops_gemm.K1PerXdlops / (APackedSize * KThreadChunk), 1>{}(
+                        [&](auto chunk) {
+                            constexpr auto a_k_step_chunk =
+                                k_step +
+                                chunk * KThreadChunk * xdlops_gemm.mfma_instr.num_input_blks;
+                            a_thread_copy_.Run(a_block_desc_m0_m1_m2_m3_k,
+                                               make_tuple(Number<m0 / MXdlPack>{},
+                                                          I0,
+                                                          Number<m0 % MXdlPack>{},
+                                                          I0,
+                                                          Number<a_k_step_chunk>{}),
+                                               a_block_buf,
+                                               a_thread_desc_,
+                                               make_tuple(Number<m0 / MXdlPack>{},
+                                                          I0,
+                                                          Number<m0 % MXdlPack>{},
+                                                          k,
+                                                          Number<chunk * KThreadChunk>{}),
+                                               a_thread_buf);
+                        });
+                });
+            });
+            __builtin_amdgcn_sched_barrier(0);
+
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                constexpr auto im_major = m0 / MXdlPack;
+                constexpr auto im_minor = m0 % MXdlPack;
+                static_for<0, KRepeat, 1>{}([&](auto k0) {
+                    constexpr auto ik_major = k0 / KXdlPack;
+                    constexpr auto ik_minor = k0 % KXdlPack;
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        constexpr auto in_major = n0 / NXdlPack;
+                        constexpr auto in_minor = n0 % NXdlPack;
+
+                        constexpr index_t a_scale_offset =
+                            a_scale_thread_desc.CalculateOffset(make_tuple(im_major, ik_major, I0));
+                        constexpr index_t b_scale_offset =
+                            b_scale_thread_desc.CalculateOffset(make_tuple(in_major, ik_major, I0));
+
+                        static_assert(0 < ScalesPerXdlopsRunPerThread,
+                                      "Must have at least one scale per Xdlops "
+                                      "per Thread.");
+
+                        vector_type<AScaleDataType, a_scale_thread_vec_size> a_scale_thread_vec;
+                        vector_type<BScaleDataType, b_scale_thread_vec_size> b_scale_thread_vec;
+
+                        // Pack scale_thread_buf into scale_thread_vec
+                        static_for<0, a_scale_thread_vec_size, 1>{}([&](auto s) {
+                            a_scale_thread_vec.template AsType<AScaleDataType>()(s) =
+                                a_scale_thread_bufs(I1)[Number<a_scale_offset + s>{}];
+                        });
+
+                        static_for<0, b_scale_thread_vec_size, 1>{}([&](auto s) {
+                            b_scale_thread_vec.template AsType<BScaleDataType>()(s) =
+                                b_scale_thread_bufs(I1)[Number<b_scale_offset + s>{}];
+                        });
+
+                        vector_type<ComputeTypeA, KPack> a_thread_vec;
+                        vector_type<ComputeTypeB, KPack> b_thread_vec;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeTypeA>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(im_major, I0, im_minor, k0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeTypeB>()(ik) =
+                                b_thread_bufs[I1][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(in_major, I0, in_minor, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type_a =
+                            typename vector_type<ComputeTypeA,
+                                                 xdlops_gemm.K1PerXdlops / APackedSize>::type;
+
+                        using mfma_input_type_b =
+                            typename vector_type<ComputeTypeB,
+                                                 xdlops_gemm.K1PerXdlops / BPackedSize>::type;
+
+                        using mfma_scale_input_type_a =
+                            typename vector_type<AScaleDataType, a_scale_thread_vec_size>::type;
+                        using mfma_scale_input_type_b =
+                            typename vector_type<BScaleDataType, b_scale_thread_vec_size>::type;
+
+                        constexpr index_t c_offset = c_thread_desc_.CalculateOffset(
+                            make_tuple(im_major, in_major, im_minor, in_minor, 0));
+
+                        // MFMA accumulation
+                        xdlops_gemm.template Run<ik_minor * MXdlPack + im_minor,
+                                                 ik_minor * NXdlPack + in_minor>(
+                            a_thread_vec.template AsType<mfma_input_type_a>(),
+                            a_scale_thread_vec.template AsType<mfma_scale_input_type_a>(),
+                            b_thread_vec.template AsType<mfma_input_type_b>(),
+                            b_scale_thread_vec.template AsType<mfma_scale_input_type_b>(),
+                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+            });
+        }
+        else if constexpr(TailNum == TailNumber::Odd)
+        {
+            static_for<0, MRepeat, 1>{}([&](auto m0) {
+                constexpr auto im_major = m0 / MXdlPack;
+                constexpr auto im_minor = m0 % MXdlPack;
+                static_for<0, KRepeat, 1>{}([&](auto k0) {
+                    constexpr auto ik_major = k0 / KXdlPack;
+                    constexpr auto ik_minor = k0 % KXdlPack;
+                    static_for<0, NRepeat, 1>{}([&](auto n0) {
+                        constexpr auto in_major = n0 / NXdlPack;
+                        constexpr auto in_minor = n0 % NXdlPack;
+
+                        constexpr index_t a_scale_offset =
+                            a_scale_thread_desc.CalculateOffset(make_tuple(im_major, ik_major, I0));
+                        constexpr index_t b_scale_offset =
+                            b_scale_thread_desc.CalculateOffset(make_tuple(in_major, ik_major, I0));
+
+                        static_assert(0 < ScalesPerXdlopsRunPerThread,
+                                      "Must have at least one scale per Xdlops "
+                                      "per Thread.");
+
+                        vector_type<AScaleDataType, a_scale_thread_vec_size> a_scale_thread_vec;
+                        vector_type<BScaleDataType, b_scale_thread_vec_size> b_scale_thread_vec;
+
+                        // Pack scale_thread_buf into scale_thread_vec
+                        static_for<0, a_scale_thread_vec_size, 1>{}([&](auto s) {
+                            a_scale_thread_vec.template AsType<AScaleDataType>()(s) =
+                                a_scale_thread_bufs(I0)[Number<a_scale_offset + s>{}];
+                        });
+
+                        static_for<0, b_scale_thread_vec_size, 1>{}([&](auto s) {
+                            b_scale_thread_vec.template AsType<BScaleDataType>()(s) =
+                                b_scale_thread_bufs(I0)[Number<b_scale_offset + s>{}];
+                        });
+
+                        vector_type<ComputeTypeA, KPack> a_thread_vec;
+                        vector_type<ComputeTypeB, KPack> b_thread_vec;
+
+                        static_for<0, KPack, 1>{}([&](auto ik) {
+                            a_thread_vec.template AsType<ComputeTypeA>()(ik) =
+                                a_thread_buf[Number<a_thread_desc_.CalculateOffset(
+                                    make_tuple(im_major, I0, im_minor, k0, ik))>{}];
+                            b_thread_vec.template AsType<ComputeTypeB>()(ik) =
+                                b_thread_bufs[I0][Number<b_thread_desc_.CalculateOffset(
+                                    make_tuple(in_major, I0, in_minor, k0, ik))>{}];
+                        });
+
+                        using mfma_input_type_a =
+                            typename vector_type<ComputeTypeA,
+                                                 xdlops_gemm.K1PerXdlops / APackedSize>::type;
+
+                        using mfma_input_type_b =
+                            typename vector_type<ComputeTypeB,
+                                                 xdlops_gemm.K1PerXdlops / BPackedSize>::type;
+
+                        using mfma_scale_input_type_a =
+                            typename vector_type<AScaleDataType, a_scale_thread_vec_size>::type;
+                        using mfma_scale_input_type_b =
+                            typename vector_type<BScaleDataType, b_scale_thread_vec_size>::type;
+
+                        constexpr index_t c_offset = c_thread_desc_.CalculateOffset(
+                            make_tuple(im_major, in_major, im_minor, in_minor, 0));
+
+                        // MFMA accumulation
+                        xdlops_gemm.template Run<ik_minor * MXdlPack + im_minor,
+                                                 ik_minor * NXdlPack + in_minor>(
+                            a_thread_vec.template AsType<mfma_input_type_a>(),
+                            a_scale_thread_vec.template AsType<mfma_scale_input_type_a>(),
+                            b_thread_vec.template AsType<mfma_input_type_b>(),
+                            b_scale_thread_vec.template AsType<mfma_scale_input_type_b>(),
+                            c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
+                    });
+                });
+            });
+        }
+    }
+
+    // Packed 3-D descriptor used to compute offsets into a_scale_thread_bufs.
+    // TODO: make this field protected when a_scale_thread_copy_ is moved here
+    static constexpr auto a_scale_thread_desc = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<MRepeat / MXdlPack>{},
+                   Number<KRepeat / KXdlPack>{},
+                   Number<ScalesPerXdlopsRunPerThread * a_scale_thread_vec_size>{}));
+
+    // Packed 3-D descriptor used to compute offsets into b_scale_thread_bufs.
+    // TODO: make this field protected when b_scale_thread_copy_ is moved here
+    static constexpr auto b_scale_thread_desc = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<NRepeat / NXdlPack>{},
+                   Number<KRepeat / KXdlPack>{},
+                   Number<ScalesPerXdlopsRunPerThread * b_scale_thread_vec_size>{}));
+
+    protected:
+    using Base::a_thread_copy_;
+    using Base::a_thread_desc_;
+    using Base::b_thread_copy_;
+    using Base::b_thread_desc_;
+    using Base::c_thread_desc_;
+};
+
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp
index 2b936c8d25..7473d2f2e7 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_mx_moe_v3.hpp
@@ -226,85 +226,197 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_mx_moe_v3<BlockGemmPipelineSched
         // constexpr auto num_dsread_a_mfma =
         //     (num_ds_read_inst_a + ds_read_a_mfma_rate - 1) / ds_read_a_mfma_rate;
 
-        constexpr auto num_total_stages = MRepeat;
+        constexpr auto num_total_stages = std::max(2, MRepeat);
+        if constexpr(num_total_stages > 2)
+        {
 
-        // Group num_mfma_perstage num_ds_read_a_perstage
-        // since we want to reuse a local register buffer
-        constexpr auto num_mfma_perstage      = num_mfma_inst / num_total_stages;
-        constexpr auto num_ds_read_a_perstage = num_ds_read_inst_a / num_total_stages;
+            // Group num_mfma_perstage num_ds_read_a_perstage
+            // since we want to reuse a local register buffer
+            constexpr auto num_mfma_perstage      = num_mfma_inst / num_total_stages;
+            constexpr auto num_ds_read_a_perstage = num_ds_read_inst_a / num_total_stages;
 
-        constexpr auto num_ds_read_a_mfma_perstage =
-            math::integer_divide_ceil(num_ds_read_a_perstage, ds_read_a_mfma_rate);
+            constexpr auto num_ds_read_a_mfma_perstage =
+                math::integer_divide_ceil(num_ds_read_a_perstage, ds_read_a_mfma_rate);
 
-        constexpr auto num_ds_read_a_prefetch_stages = 2;
+            constexpr auto num_ds_read_a_prefetch_stages = 2;
 
-        constexpr auto buffer_load_perstage_more =
-            math::integer_divide_ceil((num_buffer_load_stage1), (num_total_stages - 2));
-        constexpr auto buffer_load_perstage_less =
-            math::integer_divide_floor((num_buffer_load_stage1), (num_total_stages - 2));
-        constexpr auto buffer_load_perstage_stage2 =
-            math::integer_divide_floor((num_buffer_load_stage2), 2);
+            constexpr auto buffer_load_perstage_more =
+                math::integer_divide_ceil((num_buffer_load_stage1), (num_total_stages - 2));
+            constexpr auto buffer_load_perstage_less =
+                math::integer_divide_floor((num_buffer_load_stage1), (num_total_stages - 2));
+            constexpr auto buffer_load_perstage_stage2 =
+                math::integer_divide_floor((num_buffer_load_stage2), 2);
 
-        constexpr auto buffer_load_stages_more =
-            num_buffer_load_stage1 -
-            math::integer_divide_floor(num_buffer_load_stage1, (num_total_stages - 2)) *
-                ((num_total_stages - 2));
+            constexpr auto buffer_load_stages_more =
+                num_buffer_load_stage1 -
+                math::integer_divide_floor(num_buffer_load_stage1, (num_total_stages - 2)) *
+                    ((num_total_stages - 2));
 
-        constexpr auto buffer_load_issue_point_interval_more =
-            num_mfma_perstage / buffer_load_perstage_more;
-        constexpr auto buffer_load_issue_point_interval_less =
-            num_mfma_perstage / buffer_load_perstage_less;
-        constexpr auto buffer_load_issue_point_interval_stage2 =
-            num_mfma_perstage / buffer_load_perstage_stage2;
+            constexpr auto buffer_load_issue_point_interval_more =
+                num_mfma_perstage / buffer_load_perstage_more;
+            constexpr auto buffer_load_issue_point_interval_less =
+                num_mfma_perstage / buffer_load_perstage_less;
+            constexpr auto buffer_load_issue_point_interval_stage2 =
+                num_mfma_perstage / buffer_load_perstage_stage2;
 
-        // Stage 1
-        // global read more
-        static_for<0, buffer_load_stages_more, 1>{}([&](auto /*i*/) {
-            static_for<0, num_mfma_perstage, 1>{}([&](auto imfma) {
-                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+            // Stage 1
+            // global read more
+            static_for<0, buffer_load_stages_more, 1>{}([&](auto /*i*/) {
+                static_for<0, num_mfma_perstage, 1>{}([&](auto imfma) {
+                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
 
-                if constexpr(imfma % buffer_load_issue_point_interval_more == 0)
+                    if constexpr(imfma % buffer_load_issue_point_interval_more == 0)
+                    {
+                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                    }
+
+                    if constexpr(imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage))
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x100, ds_read_a_mfma_rate, 0); // DS read
+                    }
+                });
+            });
+
+            // global read less
+            static_for<0, (num_total_stages - 2 - buffer_load_stages_more), 1>{}([&](auto /*i*/) {
+                static_for<0, num_mfma_perstage, 1>{}([&](auto imfma) {
+                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                    if constexpr(imfma % buffer_load_issue_point_interval_less == 0)
+                    {
+                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                    }
+                    if constexpr(imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage))
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x100, ds_read_a_mfma_rate, 0); // DS read
+                    }
+                });
+            });
+
+            // Stage 2, Sync
+            // lds synchronization, prefetch next loop local A
+            static_for<0, num_ds_read_a_prefetch_stages, 1>{}([&](auto /*i*/) {
+                static_for<0, num_mfma_perstage, 1>{}([&](auto imfma) {
+                    __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                    if constexpr(imfma % buffer_load_issue_point_interval_stage2 == 0)
+                    {
+                        __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                    }
+                    if constexpr(imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage))
+                    {
+                        __builtin_amdgcn_sched_group_barrier(
+                            0x100, ds_read_a_mfma_rate, 0); // DS read
+                    }
+                });
+            });
+        }
+        else
+        {
+            constexpr auto num_buffer_load_total = num_buffer_load_inst_a + num_buffer_load_inst_b +
+                                                   num_buffer_load_a_scale +
+                                                   num_buffer_load_b_scale;
+            constexpr auto num_dsread_a_mfma = math::integer_divide_ceil(
+                num_ds_read_inst_a, ds_read_a_mfma_rate); // how many mfma per dsread_a
+
+            // stage 1
+            constexpr auto num_mfma_stage1 = num_mfma_inst - num_dsread_a_mfma;
+
+            constexpr auto mfma_perstage_more =
+                math::integer_divide_ceil(num_mfma_stage1, num_buffer_load_total);
+            constexpr auto mfma_perstage_less =
+                math::integer_divide_floor(num_mfma_stage1, num_buffer_load_total);
+
+            constexpr auto mfma_stages_more =
+                num_mfma_stage1 - mfma_perstage_less * num_buffer_load_total;
+
+            static_for<0, num_buffer_load_inst_a, 1>{}([&](auto i) {
+                if constexpr(i < mfma_stages_more)
                 {
+                    static_for<0, mfma_perstage_more, 1>{}([&](auto) {
+                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                    });
                     __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
                 }
+                else
+                {
+                    static_for<0, mfma_perstage_less, 1>{}([&](auto) {
+                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                    });
+                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                }
+            });
 
-                if constexpr(imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage))
+            static_for<0, num_buffer_load_inst_b, 1>{}([&](auto i) {
+                if constexpr((i + num_buffer_load_inst_a) < mfma_stages_more)
+                {
+                    static_for<0, mfma_perstage_more, 1>{}([&](auto /*imfma*/) {
+                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                    });
+                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                }
+                else
+                {
+                    static_for<0, mfma_perstage_less, 1>{}([&](auto /*imfma*/) {
+                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                    });
+                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                }
+            });
+
+            static_for<0, num_buffer_load_a_scale, 1>{}([&](auto i) {
+                if constexpr((i + num_buffer_load_inst_a + num_buffer_load_inst_b) <
+                             mfma_stages_more)
+                {
+                    static_for<0, mfma_perstage_more, 1>{}([&](auto /*imfma*/) {
+                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                    });
+                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                }
+                else
+                {
+                    static_for<0, mfma_perstage_less, 1>{}([&](auto /*imfma*/) {
+                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                    });
+                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                }
+            });
+
+            static_for<0, num_buffer_load_b_scale, 1>{}([&](auto i) {
+                if constexpr((i + num_buffer_load_inst_a + num_buffer_load_inst_b +
+                              num_buffer_load_a_scale) < mfma_stages_more)
+                {
+                    static_for<0, mfma_perstage_more, 1>{}([&](auto /*imfma*/) {
+                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                    });
+                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                }
+                else
+                {
+                    static_for<0, mfma_perstage_less, 1>{}([&](auto /*imfma*/) {
+                        __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                    });
+                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
+                }
+            });
+
+            // stage 2
+            static_for<0, num_dsread_a_mfma, 1>{}([&](auto i) {
+                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
+                if constexpr((num_ds_read_inst_a - (i + 1) * ds_read_a_mfma_rate) >=
+                             ds_read_a_mfma_rate)
                 {
                     __builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_mfma_rate, 0); // DS read
                 }
-            });
-        });
-
-        // global read less
-        static_for<0, (num_total_stages - 2 - buffer_load_stages_more), 1>{}([&](auto /*i*/) {
-            static_for<0, num_mfma_perstage, 1>{}([&](auto imfma) {
-                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                if constexpr(imfma % buffer_load_issue_point_interval_less == 0)
+                else
                 {
-                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                }
-                if constexpr(imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage))
-                {
-                    __builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_mfma_rate, 0); // DS read
+                    __builtin_amdgcn_sched_group_barrier(
+                        0x100,
+                        num_ds_read_inst_a - (num_dsread_a_mfma - 1) * ds_read_a_mfma_rate,
+                        0); // DS read
                 }
             });
-        });
-
-        // Stage 2, Sync
-        // lds synchronization, prefetch next loop local A
-        static_for<0, num_ds_read_a_prefetch_stages, 1>{}([&](auto /*i*/) {
-            static_for<0, num_mfma_perstage, 1>{}([&](auto imfma) {
-                __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
-                if constexpr(imfma % buffer_load_issue_point_interval_stage2 == 0)
-                {
-                    __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
-                }
-                if constexpr(imfma >= (num_mfma_perstage - num_ds_read_a_mfma_perstage))
-                {
-                    __builtin_amdgcn_sched_group_barrier(0x100, ds_read_a_mfma_rate, 0); // DS read
-                }
-            });
-        });
+        }
     }
 
     template <bool HasMainLoop,
diff --git a/include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp b/include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp
index 01bb806789..219206c5ce 100644
--- a/include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp
@@ -3,6 +3,8 @@
 
 #pragma once
 
+#include <string>
+
 namespace ck {
 namespace tensor_operation {
 namespace device {
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
index c7ee3e9ecf..650c6f11d3 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
@@ -1,5 +1,5 @@
+// Copyright (C) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -24,6 +24,10 @@
 #include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
 
+#ifdef CK_EXPERIMENTAL_BUILDER
+#include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_bwd_weight_xdl_cshuffle.hpp"
+#endif
+
 namespace ck {
 namespace tensor_operation {
 namespace device {
@@ -1225,6 +1229,19 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
         return str.str();
     }
 
+#ifdef CK_EXPERIMENTAL_BUILDER
+    std::string GetInstanceString() const override
+    {
+        static_assert(ck_tile::reflect::HasInstanceTraits<DeviceOp>,
+                      "Specialization of instance_traits not found. Please check that a "
+                      "specialization exists in file "
+                      "ck_tile/builder/reflect/"
+                      "instance_traits_device_grouped_conv_bwd_weight_xdl_cshuffle.hpp "
+                      "for the given template parameters.");
+        return ck_tile::reflect::instance_string<DeviceOp>();
+    }
+#endif
+
     size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override
     {
         auto arg = dynamic_cast<const Argument*>(p_arg);
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_moe_mx_gemm_bpreshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_moe_mx_gemm_bpreshuffle.hpp
index 214b78c38b..3841c0fe0c 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_moe_mx_gemm_bpreshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_moe_mx_gemm_bpreshuffle.hpp
@@ -122,7 +122,7 @@ struct DeviceMoeGemmMXBPreShuffle : public DeviceMoEGemmMXBPreShuffle<ALayout,
         MPerXDL,
         NPerXDL,
         MXdlPerWave,
-        NXdlPerWave_,
+        math::max(2, NXdlPerWave_),
         ABlockTransferThreadClusterLengths_AK0_M_AK1,
         ABlockTransferThreadClusterArrangeOrder,
         ABlockTransferSrcAccessOrder,
diff --git a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
index 6f24bf3df8..1bb0b63792 100644
--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
@@ -73,7 +73,7 @@ struct AddReluAdd
     __host__ __device__ constexpr void operator()<half_t, float, half_t, half_t>(
         half_t& y, const float& x0, const half_t& x1, const half_t& x2) const
     {
-        float y_float;
+        float y_float = 0.0;
         (*this)(y_float, x0, x1, x2);
         y = y_float;
     }
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_ab_transfer_thread_tiles.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_ab_transfer_thread_tiles.hpp
index 465952e285..23f16d38e9 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_ab_transfer_thread_tiles.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_ab_transfer_thread_tiles.hpp
@@ -17,6 +17,9 @@ template <typename ABLayout,
           index_t KPerBlock,
           index_t MNPerWmma,
           index_t ABK1Value,
+          index_t KPack,
+          index_t KInner,
+          index_t KPerWmmaBlk,
           bool UseBlockPaddingAB,
           bool PermuteAB,
           typename ABBlockTransferThreadClusterLengths_ABK0_MN_ABK1,
@@ -374,14 +377,93 @@ struct ABTransferThreadTiles
 #else
         constexpr auto KRow = I1;
 #endif
-        return transform_tensor_descriptor(
-            BlockDesc{},
-            make_tuple(make_unmerge_transform(make_tuple(Number<ABK0 / KRow>{}, KRow)),
-                       make_unmerge_transform(
-                           make_tuple(Number<MNRepeat>{}, Number<MNWaves>{}, Number<MNPerWmma>{})),
-                       make_pass_through_transform(Number<ABK1>{})),
-            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
-            make_tuple(Sequence<0, 3>{}, Sequence<1, 2, 4>{}, Sequence<5>{}));
+        if constexpr(KInner > 1)
+        {
+            // KPack = KInner * KPerWmma
+            // K1 = KInner * KPerWmmaBlk
+            // Each thread loads multiple tiles with one instruction
+            // 1 - MNRepeat - K0 / KRow - MNWaves - KRow - MNPerWmma - K1
+            return transform_tensor_descriptor(
+                BlockDesc{},
+                make_tuple(
+                    make_unmerge_transform(make_tuple(Number<ABK0 / KRow>{}, KRow, Number<1>{})),
+                    make_unmerge_transform(
+                        make_tuple(Number<MNRepeat>{}, Number<MNWaves>{}, Number<MNPerWmma>{})),
+                    make_pass_through_transform(Number<ABK1>{})),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                make_tuple(Sequence<2, 4, 0>{}, Sequence<1, 3, 5>{}, Sequence<6>{}));
+        }
+        else
+        {
+            // KPack = KPerWmma (KInner == 1)
+            if constexpr(ABK1 <= KPerWmmaBlk)
+            {
+                // K1 <= single tile (KPerWmmaBlk)
+                // Each thread will load KPerWmmaBlk for the WMMA instruction
+                // Since K1 <= single tile, K0 is unmerged first over KPack / KRow / K1
+                // (rest of the single WMMA tile for single thread) and then over KRow
+                // (rest of the single WMMA tile for single wave)
+                // KPack / KRow / K1 - MNRepeat - K0 / KRow - MNWaves - KRow - MNPerWmma - K1
+                return transform_tensor_descriptor(
+                    BlockDesc{},
+                    make_tuple(
+                        make_unmerge_transform(make_tuple(
+                            Number<ABK0 / (KPack / ABK1)>{}, KRow, Number<KPack / KRow / ABK1>{})),
+                        make_unmerge_transform(
+                            make_tuple(Number<MNRepeat>{}, Number<MNWaves>{}, Number<MNPerWmma>{})),
+                        make_pass_through_transform(Number<ABK1>{})),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                    make_tuple(Sequence<2, 4, 0>{}, Sequence<1, 3, 5>{}, Sequence<6>{}));
+            }
+            else
+            {
+                // K1 > single tile (KPerWmmaBlk)
+                // Each thread will load KPerWmmaBlk for the WMMA instruction
+                // Since K1 > single tile, each thread loads KPerWmmaBlk and the next
+                // KPerWmmaBlk chunk is loaded by a different thread in the same wave (WMMA layout).
+                // This layout is needed to support for example AK1 > single tile and
+                // BK1 <= single tile in the same gemm
+                // KPack / KPerWmmaBlk / KRow - MNRepeat - K0 / KRow - MNWaves - KRow - MNPerWmma -
+                // K1
+                constexpr auto desc1 = transform_tensor_descriptor(
+                    BlockDesc{},
+                    make_tuple(
+                        make_pass_through_transform(Number<ABK0>{}),
+                        make_unmerge_transform(
+                            make_tuple(Number<MNRepeat>{}, Number<MNWaves>{}, Number<MNPerWmma>{})),
+                        make_unmerge_transform(make_tuple(Number<ABK1 / KPack>{},
+                                                          Number<KPack / KPerWmmaBlk / KRow>{},
+                                                          Number<KRow>{},
+                                                          Number<KPerWmmaBlk>{}))),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                    make_tuple(Sequence<2>{}, Sequence<1, 4, 6>{}, Sequence<3, 0, 5, 7>{}));
+
+                return transform_tensor_descriptor(
+                    desc1,
+                    make_tuple(
+                        make_pass_through_transform(Number<KPack / KPerWmmaBlk / KRow>{}),
+                        make_pass_through_transform(Number<MNRepeat>{}),
+                        make_merge_transform(make_tuple(Number<ABK0>{}, Number<ABK1 / KPack>{})),
+                        make_pass_through_transform(Number<MNWaves>{}),
+                        make_pass_through_transform(Number<KRow>{}),
+                        make_pass_through_transform(Number<MNPerWmma>{}),
+                        make_pass_through_transform(Number<KPerWmmaBlk>{})),
+                    make_tuple(Sequence<0>{},
+                               Sequence<1>{},
+                               Sequence<2, 3>{},
+                               Sequence<4>{},
+                               Sequence<5>{},
+                               Sequence<6>{},
+                               Sequence<7>{}),
+                    make_tuple(Sequence<0>{},
+                               Sequence<1>{},
+                               Sequence<2>{},
+                               Sequence<3>{},
+                               Sequence<4>{},
+                               Sequence<5>{},
+                               Sequence<6>{}));
+            }
+        }
     }
 
     __device__ static constexpr auto GetBlockStep()
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_ab_transfer_wave_tiles.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_ab_transfer_wave_tiles.hpp
index 68476ef3bf..a36ccd43ca 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_ab_transfer_wave_tiles.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_ab_transfer_wave_tiles.hpp
@@ -313,14 +313,16 @@ struct ABTransferWaveTiles
         // This is a block descriptor used to read LDS memory into register
         // It's defined in a way consistent with the existing implementation to
         // avoid changes in the pipelines
-        return make_naive_tensor_descriptor(make_tuple(Number<KPerBlock / KPack>{},
+        return make_naive_tensor_descriptor(make_tuple(I1,
                                                        Number<MNRepeat>{},
+                                                       Number<KPerBlock / KPack>{},
                                                        Number<MNWaves>{},
                                                        Number<MNKRow>{},
                                                        Number<MNPerWmma>{},
                                                        Number<ABK1Value>{}),
-                                            make_tuple(Number<KPack * MNPerWmma>{},
+                                            make_tuple(I0,
                                                        Number<KPerBlock * MNPerWmma * MNWaves>{},
+                                                       Number<KPack * MNPerWmma>{},
                                                        Number<KPerBlock * MNPerWmma>{},
                                                        Number<MNPerWmma * ABK1Value>{},
                                                        Number<ABK1Value>{},
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_wmma_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_wmma_cshuffle_v3.hpp
index fa7eb4faaa..38ebdab65e 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_wmma_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_gemm_wmma_cshuffle_v3.hpp
@@ -109,9 +109,20 @@ struct GridwiseBatchedGemmGemm_wmma_cshuffle_v3
     static constexpr auto LWaves = LPerBlock / (LRepeat * LPerWmma);
     static constexpr auto NWaves = NPerBlock / (NRepeat * NPerWmma);
 
-    // TODO: I am pretty sure this is always 16 and *should* always be 16.
-    static constexpr auto KPack =
-        math::integer_least_multiple(math::integer_least_multiple(AK1Value, BK1Value), 16);
+    static constexpr index_t KPerWmmaBlk =
+        WmmaSelector<ADataType, B0DataType, Acc0DataType, MPerWmma, LPerWmma>::selected_wmma
+            .k_per_blk;
+
+    static constexpr index_t KInnerA = ck::math::integer_divide_ceil(AK1Value, KPerWmmaBlk);
+
+    static constexpr index_t KInnerB = ck::math::integer_divide_ceil(BK1Value, KPerWmmaBlk);
+
+    static constexpr index_t KInner = ck::math::min(KInnerA, KInnerB);
+
+    static constexpr index_t KPack =
+        KInner *
+        WmmaSelector<ADataType, B0DataType, Acc0DataType, MPerWmma, LPerWmma>::selected_wmma
+            .k_per_wmma;
 
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 
@@ -201,54 +212,115 @@ struct GridwiseBatchedGemmGemm_wmma_cshuffle_v3
         return b1_block_copy_step;
     }
 
+    template <index_t MNRepeat, index_t MNWaves, index_t MNPerWmma, typename BlockDesc>
+    __host__ __device__ static constexpr auto MakeWmmaTileDescriptor(const BlockDesc&)
+    {
+        // K0_MN_K1 -> 7-D wave tile descriptor (dimension order documented per branch below)
+        constexpr auto K0 = BlockDesc{}.GetLength(I0);
+        constexpr auto K1 = BlockDesc{}.GetLength(I2);
+#ifdef __gfx12__
+        constexpr auto KRow = I2;
+#else
+        constexpr auto KRow = I1;
+#endif
+
+        if constexpr(KInner > 1)
+        {
+            // KPack = KInner * KPerWmma
+            // K1 = KInner * KPerWmmaBlk
+            // Each thread loads multiple tiles with one instruction
+            // 1 - MNRepeat - K0 / KRow - MNWaves - KRow - MNPerWmma - K1
+            return transform_tensor_descriptor(
+                BlockDesc{},
+                make_tuple(
+                    make_unmerge_transform(make_tuple(Number<K0 / (KRow)>{}, KRow, Number<1>{})),
+                    make_unmerge_transform(
+                        make_tuple(Number<MNRepeat>{}, Number<MNWaves>{}, Number<MNPerWmma>{})),
+                    make_pass_through_transform(Number<K1>{})),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                make_tuple(Sequence<2, 4, 0>{}, Sequence<1, 3, 5>{}, Sequence<6>{}));
+        }
+        else
+        {
+            // KPack = KPerWmma (KInner == 1)
+            if constexpr(K1 <= KPerWmmaBlk)
+            {
+                // K1 <= single tile (KPerWmmaBlk)
+                // Each thread will load KPerWmmaBlk for the WMMA instruction
+                // Since K1 <= single tile, K0 is unmerged first over KPack / KRow / K1
+                // (rest of the single WMMA tile for single thread) and then over KRow
+                // (rest of the single WMMA tile for single wave)
+                // KPack / KRow / K1 - MNRepeat - K0 / KRow - MNWaves - KRow - MNPerWmma - K1
+                return transform_tensor_descriptor(
+                    BlockDesc{},
+                    make_tuple(make_unmerge_transform(make_tuple(
+                                   Number<K0 / (KPack / K1)>{}, KRow, Number<KPack / KRow / K1>{})),
+                               make_unmerge_transform(make_tuple(
+                                   Number<MNRepeat>{}, Number<MNWaves>{}, Number<MNPerWmma>{})),
+                               make_pass_through_transform(Number<K1>{})),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                    make_tuple(Sequence<2, 4, 0>{}, Sequence<1, 3, 5>{}, Sequence<6>{}));
+            }
+            else
+            {
+                // K1 > single tile (KPerWmmaBlk)
+                // Each thread will load KPerWmmaBlk for the WMMA instruction
+                // Since K1 > single tile, each thread loads KPerWmmaBlk and the next
+                // KPerWmmaBlk chunk is loaded by a different thread in the same wave (WMMA layout).
+                // This layout is needed to support for example AK1 > single tile and
+                // BK1 <= single tile in the same gemm
+                // KPack / KPerWmmaBlk / KRow - MNRepeat - K0 / KRow - MNWaves - KRow - MNPerWmma -
+                // K1
+                constexpr auto desc1 = transform_tensor_descriptor(
+                    BlockDesc{},
+                    make_tuple(
+                        make_pass_through_transform(Number<K0>{}),
+                        make_unmerge_transform(
+                            make_tuple(Number<MNRepeat>{}, Number<MNWaves>{}, Number<MNPerWmma>{})),
+                        make_unmerge_transform(make_tuple(Number<K1 / KPack>{},
+                                                          Number<KPack / KPerWmmaBlk / KRow>{},
+                                                          Number<KRow>{},
+                                                          Number<KPerWmmaBlk>{}))),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                    make_tuple(Sequence<2>{}, Sequence<1, 4, 6>{}, Sequence<3, 0, 5, 7>{}));
+
+                return transform_tensor_descriptor(
+                    desc1,
+                    make_tuple(make_pass_through_transform(Number<KPack / KPerWmmaBlk / KRow>{}),
+                               make_pass_through_transform(Number<MNRepeat>{}),
+                               make_merge_transform(make_tuple(Number<K0>{}, Number<K1 / KPack>{})),
+                               make_pass_through_transform(Number<MNWaves>{}),
+                               make_pass_through_transform(Number<KRow>{}),
+                               make_pass_through_transform(Number<MNPerWmma>{}),
+                               make_pass_through_transform(Number<KPerWmmaBlk>{})),
+                    make_tuple(Sequence<0>{},
+                               Sequence<1>{},
+                               Sequence<2, 3>{},
+                               Sequence<4>{},
+                               Sequence<5>{},
+                               Sequence<6>{},
+                               Sequence<7>{}),
+                    make_tuple(Sequence<0>{},
+                               Sequence<1>{},
+                               Sequence<2>{},
+                               Sequence<3>{},
+                               Sequence<4>{},
+                               Sequence<5>{},
+                               Sequence<6>{}));
+            }
+        }
+    }
+
     template <typename ABlockDesc_>
     __host__ __device__ static constexpr auto MakeAWaveDescriptor(const ABlockDesc_&)
     {
-        constexpr auto a_wave_desc = [&]() {
-            // AK0_M_AK1 -> AK0_MRepeat_Mwaves_AKRow_MPerWmma_AK1
-            constexpr auto A_K0 = ABlockDesc_{}.GetLength(I0);
-            constexpr auto A_K1 = ABlockDesc_{}.GetLength(I2);
-#ifdef __gfx12__
-            constexpr auto A_KRow = I2;
-#else
-            constexpr auto A_KRow = I1;
-#endif
-            return transform_tensor_descriptor(
-                ABlockDesc_{},
-                make_tuple(make_unmerge_transform(make_tuple(Number<A_K0 / A_KRow>{}, A_KRow)),
-                           make_unmerge_transform(
-                               make_tuple(Number<MRepeat>{}, Number<MWaves>{}, Number<MPerWmma>{})),
-                           make_pass_through_transform(Number<A_K1>{})),
-                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
-                make_tuple(Sequence<0, 3>{}, Sequence<1, 2, 4>{}, Sequence<5>{}));
-        }();
-
-        return a_wave_desc;
+        return MakeWmmaTileDescriptor<MRepeat, MWaves, MPerWmma>(ABlockDesc_{});
     }
 
     template <typename B0BlockDesc_>
     __host__ __device__ static constexpr auto MakeB0WaveDescriptor(const B0BlockDesc_&)
     {
-        constexpr auto b0_wave_desc = [&]() {
-            // BK0_L_BK1 -> BK0_LRepeat_Lwaves_BKRow_LPerWmma_BK1
-            constexpr auto B_K0 = B0BlockDesc_{}.GetLength(I0);
-            constexpr auto B_K1 = B0BlockDesc_{}.GetLength(I2);
-#ifdef __gfx12__
-            constexpr auto B_KRow = I2;
-#else
-            constexpr auto B_KRow = I1;
-#endif
-            return transform_tensor_descriptor(
-                B0BlockDesc_{},
-                make_tuple(make_unmerge_transform(make_tuple(Number<B_K0 / B_KRow>{}, B_KRow)),
-                           make_unmerge_transform(
-                               make_tuple(Number<LRepeat>{}, Number<LWaves>{}, Number<LPerWmma>{})),
-                           make_pass_through_transform(Number<B_K1>{})),
-                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
-                make_tuple(Sequence<0, 3>{}, Sequence<1, 2, 4>{}, Sequence<5>{}));
-        }();
-
-        return b0_wave_desc;
+        return MakeWmmaTileDescriptor<LRepeat, LWaves, LPerWmma>(B0BlockDesc_{});
     }
 
     template <typename A1BlockDesc_AL0_M_AL1>
@@ -356,6 +428,7 @@ struct GridwiseBatchedGemmGemm_wmma_cshuffle_v3
                                 MRepeat,
                                 LRepeat,
                                 KPack,
+                                KInner,
                                 true>())>; // TransposeC (must be true to work), C' = B' x A'
 
     // block_id to matrix tile idx (m0, n0) mapping is controlled by {M01, N01}
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3_common.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3_common.hpp
index 7a5e324468..56f09cee96 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3_common.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_wmma_cshuffle_v3_common.hpp
@@ -151,10 +151,20 @@ struct GridwiseGemm_wmma_cshuffle_v3_base
     static constexpr auto AK1Number = Number<AK1Value>{};
     static constexpr auto BK1Number = Number<BK1Value>{};
 
-    static constexpr index_t KPack = math::max(
-        math::lcm(AK1Number, BK1Number),
+    static constexpr index_t KPerWmmaBlk =
         WmmaSelector<ComputeTypeA, ComputeTypeB, AccDataType, MPerWmma, NPerWmma>::selected_wmma
-            .k_per_wmma);
+            .k_per_blk;
+
+    static constexpr index_t KInnerA = ck::math::integer_divide_ceil(AK1Value, KPerWmmaBlk);
+
+    static constexpr index_t KInnerB = ck::math::integer_divide_ceil(BK1Value, KPerWmmaBlk);
+
+    static constexpr index_t KInner = ck::math::min(KInnerA, KInnerB);
+
+    static constexpr index_t KPack =
+        KInner *
+        WmmaSelector<ComputeTypeA, ComputeTypeB, AccDataType, MPerWmma, NPerWmma>::selected_wmma
+            .k_per_wmma;
 
     using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 
@@ -218,6 +228,9 @@ struct GridwiseGemm_wmma_cshuffle_v3_base
                               KPerBlock,
                               MPerWmma,
                               AK1Value,
+                              KPack,
+                              KInner,
+                              KPerWmmaBlk,
                               UseBlockPaddingA,
                               PermuteA,
                               ABlockTransferThreadClusterLengths_AK0_M_AK1,
@@ -251,6 +264,9 @@ struct GridwiseGemm_wmma_cshuffle_v3_base
                               KPerBlock,
                               NPerWmma,
                               BK1Value,
+                              KPack,
+                              KInner,
+                              KPerWmmaBlk,
                               UseBlockPaddingB,
                               PermuteB,
                               BBlockTransferThreadClusterLengths_BK0_N_BK1,
@@ -563,7 +579,8 @@ struct GridwiseGemm_wmma_cshuffle_v3_base
                                                            NPerWmma,
                                                            MRepeat,
                                                            NRepeat,
-                                                           KPack>())>;
+                                                           KPack,
+                                                           KInner>())>;
 
     // Used to create obj in global function and pass it to Run method
     using EpilogueCShuffle =
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx_bpreshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx_bpreshuffle.hpp
index 3d2ef9b6c4..7c5bd606b2 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx_bpreshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx_bpreshuffle.hpp
@@ -429,8 +429,8 @@ struct GridwiseGemmMX_xdl_cshuffle_v3_bpreshuffle
         constexpr index_t MWave           = MPerBlock / (MXdlPerWave * MPerXdl);
         constexpr index_t WaveSize        = BlockSize / (MWave * NWave);
         constexpr index_t NkSwizzleNumber = Number<WaveSize * KPack>{};
-        return make_naive_tensor_descriptor_packed(
-            make_tuple(N0 / NWave / NXdlPack, NWave, NXdlPack, K0, NkSwizzleNumber));
+        return make_naive_tensor_descriptor_packed(make_tuple(
+            math::integer_divide_ceil(N0, NWave * NXdlPack), NWave, NXdlPack, K0, NkSwizzleNumber));
     }
 
     __host__ __device__ static auto MakeBGridDescriptor_BK0_N_BK1(
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm_bpreshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm_bpreshuffle.hpp
index c367079aab..1c471fb873 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm_bpreshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm_bpreshuffle.hpp
@@ -48,28 +48,25 @@ __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
     kernel_moe_mxgemm(typename GridwiseGemm::Argument karg)
 {
 #if defined(__gfx9__)
-    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
-    {
-        __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
 
-        auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
+    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
 
-        GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-            karg.p_sorted_token_ids,
-            karg.p_sorted_expert_ids,
-            karg.p_max_token_id,
-            karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
-            karg.p_a_scale_grid + splitk_batch_offset.a_k_split_offset,
-            karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
-            karg.p_b_scale_grid + splitk_batch_offset.b_k_split_offset,
-            karg.p_ds_grid,
-            karg.p_c_grid,
-            p_shared,
-            karg,
-            karg.a_element_op,
-            karg.b_element_op,
-            karg.c_element_op);
-    }
+    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
+        karg.p_sorted_token_ids,
+        karg.p_sorted_expert_ids,
+        karg.p_max_token_id,
+        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
+        karg.p_a_scale_grid + splitk_batch_offset.a_scale_k_split_offset,
+        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
+        karg.p_b_scale_grid + splitk_batch_offset.b_scale_k_split_offset,
+        karg.p_ds_grid,
+        karg.p_c_grid,
+        p_shared,
+        karg,
+        karg.a_element_op,
+        karg.b_element_op,
+        karg.c_element_op);
 #else
     ignore = karg;
 #endif // end of if (defined(__gfx9__))
@@ -1249,7 +1246,6 @@ struct GridwiseMoeGemmMX_BPreshuffle
     __host__ static constexpr bool CalculateHasMainKBlockLoop(index_t K)
     {
         const index_t num_loop = K / KPerBlock;
-
         return BlockwiseGemmPipe::BlockHasHotloop(num_loop);
     }
 
@@ -1279,7 +1275,6 @@ struct GridwiseMoeGemmMX_BPreshuffle
     // using Block2CTileMapDefault = BlockToCTileMap_Grouped_M00_N0_M01Adapt<8, MPerBlock,
     // NPerBlock>;
 
-#if 0
     template <bool HasMainKBlockLoop,
               InMemoryDataOperationEnum CGlobalMemoryDataOperation,
               TailNumber TailNum = TailNumber::Odd>
@@ -1298,9 +1293,10 @@ struct GridwiseMoeGemmMX_BPreshuffle
                                BElementwiseOperation b_element_op,
                                CElementwiseOperation c_element_op)
     {
+        ignore                           = a_element_op;
         ignore                           = b_element_op;
-        index_t BN0Shuffled = CalculateBN0Shuffled(problem.N);
-        index_t BK0Shuffled = CalculateBK0Shuffled(problem.K);        
+        index_t BN0Shuffled              = CalculateBN0Shuffled(problem.N);
+        index_t BK0Shuffled              = CalculateBK0Shuffled(problem.K);
         const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
             IsInputGemm ? problem.NumTokens : problem.NumTokens * problem.TopK,
             problem.MPadded,
@@ -1317,29 +1313,41 @@ struct GridwiseMoeGemmMX_BPreshuffle
             problem.NPadded,
             problem.StrideC);
 
-        const auto a_scale_grid_desc_am_ak = make_naive_tensor_descriptor_packed(
-            make_tuple((IsInputGemm ? problem.NumTokens : problem.M) / (MXdlPack * MPerBlock),
+        // We pad the M unconditionally for Scale
+        const auto Padded_Scale_M =
+            math::integer_divide_ceil(problem.M, ScaleBlockSize) * ScaleBlockSize;
+        const auto a_scale_grid_desc_am_ak = make_naive_tensor_descriptor(
+            make_tuple(Padded_Scale_M / (MXdlPack * MPerXdl),
                        math::integer_divide_ceil(problem.K, (ScaleBlockSize / APackedSize)) /
                            (KXdlPack * 64 / MPerXdl),
-                       64 * KXdlPack * MXdlPack / scale_pack_size_a));
+                       64 * KXdlPack * MXdlPack / scale_pack_size_a),
+            make_tuple(math::integer_divide_ceil(problem.K * problem.KBatch,
+                                                 (ScaleBlockSize / APackedSize)) *
+                           MPerXdl * MXdlPack / scale_pack_size_a,
+                       64 * KXdlPack * MXdlPack / scale_pack_size_a,
+                       1));
 
-        const auto b_scale_grid_desc_bn_ak = make_naive_tensor_descriptor_packed(
+        const auto b_scale_grid_desc_bn_ak = make_naive_tensor_descriptor(
             make_tuple(problem.N / (NXdlPack * NPerXdl),
                        math::integer_divide_ceil(problem.K, (ScaleBlockSize / BPackedSize)) /
                            (KXdlPack * 64 / NPerXdl),
-                       64 * KXdlPack * NXdlPack / scale_pack_size_b));
+                       64 * KXdlPack * NXdlPack / scale_pack_size_b),
+            make_tuple(math::integer_divide_ceil(problem.K * problem.KBatch,
+                                                 (ScaleBlockSize / BPackedSize)) *
+                           NPerXdl * NXdlPack / scale_pack_size_b,
+                       64 * KXdlPack * NXdlPack / scale_pack_size_b,
+                       1));
 
         const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
             MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
                 c_grid_desc_m_n, problem.MBlock, problem.NBlock);
-        const index_t max_token_id = __builtin_amdgcn_readfirstlane(p_max_token_id[0]);
-        // static_assert(NSwizzle == false, "to do fix: need another pr in sorting merged");
+
+        const index_t max_token_id    = __builtin_amdgcn_readfirstlane(p_max_token_id[0]);
         const index_t expert_block_id = NSwizzle ? blockIdx.x / problem.NBlock : blockIdx.y;
         if(expert_block_id * MPerBlock >= max_token_id)
             return;
         const index_t expert_id =
             __builtin_amdgcn_readfirstlane(p_sorted_expert_ids[expert_block_id]);
-
         const auto block_mn = [&]() -> std::pair<int, int> {
             if constexpr(NSwizzle)
             {
@@ -1372,86 +1380,78 @@ struct GridwiseMoeGemmMX_BPreshuffle
         constexpr auto AK1Threads = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I2);
         constexpr auto AKThreads  = AK0Threads * AK1Threads;
         constexpr auto AMRepeats  = MPerBlock / AMThreads;
-        const index_t token_pos   = block_m_id * MPerBlock + threadIdx.x / AKThreads * AMRepeats;
+        const index_t token_pos   = block_m_id * MPerBlock + threadIdx.x / AKThreads;
 
         if(token_pos >= max_token_id || token0 >= problem.NumTokens)
             return;
         StaticallyIndexedArray<IndexType, AMRepeats> gather_offsets;
         static_for<0, AMRepeats, 1>{}([&](auto m0) {
-            const index_t fused_token = p_sorted_token_ids[token_pos + m0];
+            const index_t fused_token = p_sorted_token_ids[token_pos + m0 * AMThreads];
             index_t token_offset      = fused_token & 0xffffff;
             if constexpr(!IsInputGemm)
             {
                 token_offset = token_offset * problem.TopK + (fused_token >> 24);
             }
-            gather_offsets(m0) = static_cast<IndexType>(token_offset) * problem.K / APackedSize;
+            gather_offsets(m0) = static_cast<IndexType>(token_offset) * problem.K;
         });
+
         const index_t expert_stride =
             __builtin_amdgcn_readfirstlane(problem.N * problem.K * (IsInputGemm ? 2 : 1));
-        const index_t expert_scale_stride =
-            __builtin_amdgcn_readfirstlane(problem.N * (IsInputGemm ? 2 : 1) *
-                                           math::integer_divide_ceil(problem.K, ScaleBlockSize));
+        const index_t expert_scale_stride = __builtin_amdgcn_readfirstlane(
+            problem.N * (IsInputGemm ? 2 : 1) *
+            math::integer_divide_ceil(problem.K, ScaleBlockSize / BPackedSize));
 
         // N0, K0, Blocksize*KPack
         const index_t n_block_data_idx_on_grid =
-            __builtin_amdgcn_readfirstlane(block_n_id * NXdlPerWave);
+            __builtin_amdgcn_readfirstlane(block_n_id * NXdlPerWave / NXdlPack);
 
+        // Grid buffer creation
         const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
         const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_b_grid + expert_id * expert_stride / BPackedSize,
-            b_grid_desc_bpreshuffled.GetElementSpaceSize());
+            p_b_grid + expert_id * expert_stride, b_grid_desc_bpreshuffled.GetElementSpaceSize());
 
         // A, B scale buffer
         const auto a_scale_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
             p_a_scale_grid, a_scale_grid_desc_am_ak.GetElementSpaceSize());
         const auto b_scale_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_b_scale_grid + expert_id * expert_scale_stride,
+            p_b_scale_grid + (expert_id * expert_scale_stride) / sizeof(BScaleDataType),
             b_scale_grid_desc_bn_ak.GetElementSpaceSize());
 
         // A matrix in LDS memory, dst of blockwise copy
         constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
 
         // B matrix in LDS memory, dst of blockwise copy
-        // dummy
         constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
-        // A matrix blockwise copy
-        auto a_blockwise_copy = ThreadGroupTensorSliceTransfer_v4r1_gather<
+
+        // A matrix blockwise direct to LDS copy
+        auto a_blockwise_copy = ThreadGroupTensorSliceTransfer_Gather_DirectLoad<
             ThisThreadBlock,
-            AElementwiseOperation,
-            ck::tensor_operation::element_wise::PassThrough,
-            InMemoryDataOperationEnum::Set,
             Sequence<AK0Number, MPerBlock, AK1Number>,
             ABlockTransferThreadClusterLengths_AK0_M_AK1,
             ABlockTransferThreadClusterArrangeOrder,
             ADataType,
-            LDSTypeA,
+            ADataType,
             decltype(a_grid_desc_ak0_m_ak1),
             decltype(a_block_desc_ak0_m_ak1),
             ABlockTransferSrcAccessOrder,
-            Sequence<0, 1, 2>,
             ABlockTransferSrcVectorDim,
             2,
             ABlockTransferSrcScalarPerVector,
-            ABlockTransferDstScalarPerVector_AK1,
-            1,
-            1,
-            AThreadTransferSrcResetCoordinateAfterRun,
-            true,
             IndexType,
-            1,
-            BlockwiseGemmPipe::GlobalBufferNum>(a_grid_desc_ak0_m_ak1,
-                                                make_multi_index(0, 0, 0),
-                                                a_element_op,
-                                                a_block_desc_ak0_m_ak1,
-                                                make_multi_index(0, 0, 0),
-                                                ck::tensor_operation::element_wise::PassThrough{},
-                                                gather_offsets);
+            1>(a_grid_desc_ak0_m_ak1,
+               make_multi_index(0, 0, 0),
+               a_block_desc_ak0_m_ak1,
+               make_multi_index(0, 0, 0),
+               gather_offsets);
 
         // Thread-wise copy
         // K0 -> N0/NWave -> NWave -> KLane -> NLane -> KPack
-        auto b_block_buf = make_static_buffer<AddressSpaceEnum::Vgpr, BDataType>(
+        auto b_block_buf_ping = make_static_buffer<AddressSpaceEnum::Vgpr, BDataType>(
             b_block_desc_bk0_n_bk1.GetElementSpaceSize());
+        auto b_block_buf_pong = make_static_buffer<AddressSpaceEnum::Vgpr, BDataType>(
+            b_block_desc_bk0_n_bk1.GetElementSpaceSize());
+        auto b_block_bufs = make_tuple(b_block_buf_ping, b_block_buf_pong);
 
         auto b_blockwise_copy =
             ThreadwiseTensorSliceTransfer_v2<BDataType,
@@ -1463,7 +1463,7 @@ struct GridwiseMoeGemmMX_BPreshuffle
                                                       Number<NXdlPack>{},
                                                       Number<KRepeat>{},
                                                       Number<BK1Value>{}>,
-                                             Sequence<1, 2, 0, 3>,
+                                             Sequence<0, 1, 2, 3, 4>,
                                              4,
                                              BBlockTransferSrcScalarPerVector,
                                              BThreadTransferSrcResetCoordinateAfterRun,
@@ -1472,16 +1472,16 @@ struct GridwiseMoeGemmMX_BPreshuffle
                 make_multi_index(n_block_data_idx_on_grid,
                                  get_warp_local_1d_id() % NWave,
                                  0,
-                                 KPack / KGroup * (get_thread_local_1d_id() % WarpSize)));
+                                 0,
+                                 KPack * (get_thread_local_1d_id() % WarpSize)));
 
         // LDS allocation for A and B: be careful of alignment
         // Cast after lds
         auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<LDSTypeA*>(p_shared),
-            a_block_desc_ak0_m_ak1.GetElementSpaceSize() / APackedSize);
+            static_cast<ADataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
 
         constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0);
-        constexpr auto b_block_slice_copy_step = make_multi_index(0, 0, KRepeat, 0);
+        constexpr auto b_block_slice_copy_step = make_multi_index(0, 0, 0, KRepeat, 0);
 
         // Blockwise GEMM pipeline
         static_assert(std::is_default_constructible_v<BlockwiseGemmPipe>);
@@ -1505,13 +1505,16 @@ struct GridwiseMoeGemmMX_BPreshuffle
         const auto waveId_m = wave_idx[I0];
         const auto waveId_n = wave_idx[I1];
 
-        static constexpr auto mfma = BlockwiseGemmPipe::xdlops_gemm.mfma;
-
         auto thread_offset_shuffled =
             get_thread_local_1d_id() % BlockwiseGemmPipe::WaveSize * KXdlPack * MXdlPack;
 
         auto a_thread_offset_m = waveId_m;
 
+        // get each thread's offset in the scale tensor
+        const index_t token_scale_pos = block_m_id * MPerBlock;
+        if(token_scale_pos >= max_token_id || token0 >= problem.NumTokens)
+            return;
+
         auto a_scale_thread_copy = ThreadwiseTensorSliceTransfer_v2<
             AScaleDataType,
             AScaleDataType,
@@ -1538,7 +1541,7 @@ struct GridwiseMoeGemmMX_BPreshuffle
             Sequence<1, 1, KXdlPack * NXdlPack / scale_pack_size_b>, // SliceLengths
             Sequence<0, 1, 2>,                                       // DimAccessOrder
             2,                                                       // SrcVectorDim
-            KXdlPack * MXdlPack / scale_pack_size_b,                 // SrcScalarPerVector
+            KXdlPack * NXdlPack / scale_pack_size_b,                 // SrcScalarPerVector
             1,                                                       // SrcScalarStrideInVector
             true>(b_scale_grid_desc_bn_ak,
                   make_multi_index(block_n_id * NPerBlock / NPerXdl / NXdlPack + b_thread_offset_n,
@@ -1547,29 +1550,37 @@ struct GridwiseMoeGemmMX_BPreshuffle
 
         if constexpr(IsInputGemm)
         {
-            const BDataType* p_b_grid_up = p_b_grid + expert_stride / 2 / BPackedSize;
+            const BDataType* p_b_grid_up = p_b_grid + expert_stride / 2;
             const auto b_grid_buf_up     = make_dynamic_buffer<AddressSpaceEnum::Global>(
-                p_b_grid_up + expert_id * expert_stride / BPackedSize,
+                p_b_grid_up + expert_id * expert_stride,
                 b_grid_desc_bpreshuffled.GetElementSpaceSize());
-            auto b_blockwise_copy_up = ThreadwiseTensorSliceTransfer_v2<
-                BDataType,
-                BDataType,
-                decltype(b_grid_desc_bpreshuffled),
-                decltype(b_block_desc_bk0_n_bk1),
-                Sequence<Number<NXdlPerWave>{}, I1, Number<KRepeat>{}, Number<BK1Value>{}>,
-                Sequence<1, 2, 0, 3>,
-                3,
-                BBlockTransferSrcScalarPerVector,
-                BThreadTransferSrcResetCoordinateAfterRun,
-                true>(b_grid_desc_bpreshuffled,
-                      make_multi_index(n_block_data_idx_on_grid,
-                                       get_warp_local_1d_id() % NWave,
-                                       0,
-                                       KPack / KGroup * (get_thread_local_1d_id() % WarpSize)));
-            const BScaleDataType* p_b_scale_grid_up = p_b_scale_grid + expert_scale_stride / 2;
-            const auto b_scale_grid_buf_up          = make_dynamic_buffer<AddressSpaceEnum::Global>(
-                p_b_scale_grid_up + expert_id * expert_scale_stride,
+            auto b_blockwise_copy_up =
+                ThreadwiseTensorSliceTransfer_v2<BDataType,
+                                                 BDataType,
+                                                 decltype(b_grid_desc_bpreshuffled),
+                                                 decltype(b_block_desc_bk0_n_bk1),
+                                                 Sequence<Number<NXdlPerWave / NXdlPack>{},
+                                                          I1,
+                                                          Number<NXdlPack>{},
+                                                          Number<KRepeat>{},
+                                                          Number<BK1Value>{}>,
+                                                 Sequence<0, 1, 2, 3, 4>,
+                                                 4,
+                                                 BBlockTransferSrcScalarPerVector,
+                                                 BThreadTransferSrcResetCoordinateAfterRun,
+                                                 true>(
+                    b_grid_desc_bpreshuffled,
+                    make_multi_index(n_block_data_idx_on_grid,
+                                     get_warp_local_1d_id() % NWave,
+                                     0,
+                                     0,
+                                     KPack * (get_thread_local_1d_id() % WarpSize)));
+            const BScaleDataType* p_b_scale_grid_up =
+                p_b_scale_grid + expert_scale_stride / 2 / sizeof(BScaleDataType);
+            const auto b_scale_grid_buf_up = make_dynamic_buffer<AddressSpaceEnum::Global>(
+                p_b_scale_grid_up + expert_id * expert_scale_stride / sizeof(BScaleDataType),
                 b_scale_grid_desc_bn_ak.GetElementSpaceSize());
+
             auto b_scale_thread_copy_up = ThreadwiseTensorSliceTransfer_v2<
                 BScaleDataType,
                 BScaleDataType,
@@ -1587,25 +1598,30 @@ struct GridwiseMoeGemmMX_BPreshuffle
                                  thread_offset_shuffled / scale_pack_size_b));
 
             blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(
+                // A
                 a_grid_desc_ak0_m_ak1,
                 a_block_desc_ak0_m_ak1,
                 a_blockwise_copy,
                 a_grid_buf,
                 a_block_buf,
                 a_block_slice_copy_step,
+                // Gate and Up
                 b_grid_desc_bpreshuffled,
                 b_block_desc_bk0_n_bk1,
                 b_blockwise_copy,
                 b_blockwise_copy_up,
                 b_grid_buf,
                 b_grid_buf_up,
-                b_block_buf,
+                b_block_bufs,
                 b_block_slice_copy_step,
+                // C
                 c_thread_buf,
                 c_thread_buf_up,
+                // A scale
                 a_scale_grid_desc_am_ak,
                 a_scale_thread_copy,
                 a_scale_grid_buf,
+                // B scale
                 b_scale_grid_desc_bn_ak,
                 b_scale_thread_copy,
                 b_scale_thread_copy_up,
@@ -1616,23 +1632,23 @@ struct GridwiseMoeGemmMX_BPreshuffle
         else
         {
             blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(
-                a_grid_desc_ak0_m_ak1,
+                a_grid_desc_ak0_m_ak1, // A
                 a_block_desc_ak0_m_ak1,
                 a_blockwise_copy,
                 a_grid_buf,
                 a_block_buf,
                 a_block_slice_copy_step,
-                b_grid_desc_bpreshuffled,
+                b_grid_desc_bpreshuffled, // B
                 b_block_desc_bk0_n_bk1,
                 b_blockwise_copy,
                 b_grid_buf,
-                b_block_buf,
+                b_block_bufs,
                 b_block_slice_copy_step,
-                c_thread_buf,
-                a_scale_grid_desc_am_ak,
+                c_thread_buf,            // C
+                a_scale_grid_desc_am_ak, // A scale
                 a_scale_thread_copy,
                 a_scale_grid_buf,
-                b_scale_grid_desc_bn_ak,
+                b_scale_grid_desc_bn_ak, // B scale
                 b_scale_thread_copy,
                 b_scale_grid_buf,
                 num_k_block_main_loop);
@@ -1643,84 +1659,101 @@ struct GridwiseMoeGemmMX_BPreshuffle
             static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 &&
                               NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0,
                           "wrong!");
+            static_assert(CShuffleMXdlPerWavePerShuffle % MXdlPack == 0 &&
+                              CShuffleNXdlPerWavePerShuffle % NXdlPack == 0,
+                          "wrong!");
+
+            constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
 
             // TODO: hacky, fix it!
             constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 =
-                blockwise_gemm_pipeline.GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
+                blockwise_gemm_pipeline.GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_M3_M4_M5_N3();
 
             // TODO: hacky, fix it!
             // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths
             constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp =
-                blockwise_gemm_pipeline.GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2();
+                blockwise_gemm_pipeline.GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_M3_M4_M5_N3();
 
             constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0);
             constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1);
             constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2);
             constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3);
             constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4);
-            constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5);
-            constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6);
-            constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7);
+            constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5);
+            constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6);
+            constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7);
+            constexpr auto M5 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I8);
+            constexpr auto N3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I9);
 
             // mul scales
-            static_assert(M0 * M1 * M2 * M3 * M4 == MPerBlock);
-            static_assert(M4 == 4);
+
+            static_assert(M0 * M1 * M2 * M3 * M4 * M5 == MPerBlock);
+            static_assert(M5 == 4);
             const index_t m1 = get_warp_local_1d_id() / NWave;
-            const index_t m3 = threadIdx.x % get_warp_size() / MPerXdl;
+            const index_t m4 = threadIdx.x % get_warp_size() / MPerXdl;
 
             vector_type<float, 4> topk_weights; // for gemm2 only
-            static_for<0, NXdlPerWave, 1>{}([&](auto n0) {
-                static_for<0, MXdlPerWave, 1>{}([&](auto m0) { // MXDLPerWave
-                    static_for<0, M2, 1>{}([&](auto m2) {      // m_inst_num_groups_per_blk
-                        const index_t m_pos = block_m_id * MPerBlock + m0 * M1 * M2 * M3 * M4 +
-                                              m1 * M2 * M3 * M4 + m2 * M3 * M4 + m3 * M4;
-                        if constexpr(MulRoutedWeight)
-                        {
-                            topk_weights = *c_style_pointer_cast<const vector_type<float, M4>*>(
-                                p_ds_grid[I2] + m_pos);
-                        }
-                        static_for<0, M4, 1>{}([&](auto m4) { // m_inst_group_size
-                            constexpr index_t c_offset =
-                                blockwise_gemm_pipeline.GetCThreadDesc().CalculateOffset(
-                                    make_tuple(m0, n0, m2 * M4 + m4));
-                            constexpr auto cidx = Number<c_offset>{};
-
-                            if constexpr(IsInputGemm) // gu fusion
-                            {
-                                if constexpr(ActivationOperation == Activation::silu_and_mul)
-                                {
-                                    float gate = c_thread_buf[cidx];
-                                    float up   = c_thread_buf_up[cidx];
-                                    if constexpr(MulRoutedWeight)
-                                    {
-                                        gate = gate * topk_weights.AsType<float>()[m4];
-                                        up   = up * topk_weights.AsType<float>()[m4];
-                                    }
-                                    tensor_operation::element_wise::Silu{}(gate, gate);
-                                    c_thread_buf_fp32(cidx) = gate * up;
-                                }
-                                else if(ActivationOperation == Activation::gelu_and_mul)
-                                {
-                                    float gate = c_thread_buf[cidx];
-                                    float up   = c_thread_buf_up[cidx];
-                                    if constexpr(MulRoutedWeight)
-                                    {
-                                        gate = gate * topk_weights.AsType<float>()[m4];
-                                        up   = up * topk_weights.AsType<float>()[m4];
-                                    }
-                                    tensor_operation::element_wise::Gelu{}(gate, gate);
-                                    c_thread_buf_fp32(cidx) = gate * up;
-                                }
-                            }
-                            else
-                            {
-                                c_thread_buf_fp32(cidx) = c_thread_buf[cidx];
+            static_for<0, NXdlPerWave / NXdlPack, 1>{}([&](auto n0) {
+                static_for<0, NXdlPack, 1>{}([&](auto inxdl) {                // NXdlPack
+                    static_for<0, MXdlPerWave / MXdlPack, 1>{}([&](auto m0) { // MXDLPerWave
+                        static_for<0, MXdlPack, 1>{}([&](auto imxdl) {        // MXdlPack
+                            static_for<0, M3, 1>{}([&](auto m3) { // m_inst_num_groups_per_blk
+                                const index_t m_pos = block_m_id * MPerBlock +
+                                                      m0 * M2 * M1 * M3 * M4 * M5 +
+                                                      m1 * M2 * M3 * M4 * M5 +
+                                                      imxdl * M3 * M4 * M5 + m3 * M4 * M5 + m4 * M5;
                                 if constexpr(MulRoutedWeight)
                                 {
-                                    c_thread_buf_fp32(cidx) =
-                                        topk_weights.AsType<float>()[m4] * c_thread_buf_fp32[cidx];
+                                    topk_weights =
+                                        *c_style_pointer_cast<const vector_type<float, M5>*>(
+                                            p_ds_grid[I2] + m_pos);
                                 }
-                            }
+                                static_for<0, M5, 1>{}([&](auto m5) { // m_inst_group_size
+                                    constexpr index_t c_offset =
+                                        blockwise_gemm_pipeline.GetCThreadDesc().CalculateOffset(
+                                            make_tuple(m0, n0, imxdl, inxdl, m3 * M5 + m5));
+                                    constexpr auto cidx = Number<c_offset>{};
+
+                                    if constexpr(IsInputGemm) // gu fusion
+                                    {
+                                        if constexpr(ActivationOperation ==
+                                                     Activation::silu_and_mul)
+                                        {
+                                            float gate = c_thread_buf[cidx];
+                                            float up   = c_thread_buf_up[cidx];
+                                            if constexpr(MulRoutedWeight)
+                                            {
+                                                gate = gate * topk_weights.AsType<float>()[m5];
+                                                up   = up * topk_weights.AsType<float>()[m5];
+                                            }
+                                            tensor_operation::element_wise::Silu{}(gate, gate);
+                                            c_thread_buf_fp32(cidx) = gate * up;
+                                        }
+                                        else if(ActivationOperation == Activation::gelu_and_mul)
+                                        {
+                                            float gate = c_thread_buf[cidx];
+                                            float up   = c_thread_buf_up[cidx];
+                                            if constexpr(MulRoutedWeight)
+                                            {
+                                                gate = gate * topk_weights.AsType<float>()[m5];
+                                                up   = up * topk_weights.AsType<float>()[m5];
+                                            }
+                                            tensor_operation::element_wise::Gelu{}(gate, gate);
+                                            c_thread_buf_fp32(cidx) = gate * up;
+                                        }
+                                    }
+                                    else
+                                    {
+                                        c_thread_buf_fp32(cidx) = c_thread_buf[cidx];
+                                        if constexpr(MulRoutedWeight)
+                                        {
+                                            c_thread_buf_fp32(cidx) =
+                                                topk_weights.AsType<float>()[m5] *
+                                                c_thread_buf_fp32[cidx];
+                                        }
+                                    }
+                                });
+                            });
                         });
                     });
                 });
@@ -1738,19 +1771,25 @@ struct GridwiseMoeGemmMX_BPreshuffle
                 make_tuple(
                     make_freeze_transform(I0),
                     make_unmerge_transform(make_tuple(
-                        Number<CShuffleMXdlPerWavePerShuffle>{}, // M0 (MXdlPerWave) per shuffle
-                        M1,                                      // M1 = MWave
-                        M2,                                      // M2 * M3 * M4 = MPerXdl
+                        Number<CShuffleMXdlPerWavePerShuffle / MXdlPack>{}, // M0 (MXdlPerWave) per
+                                                                            // shuffle
+                        M1,                                                 // M1 = MWave
+                        M2, // M2 = MXdlPack; M3 * M4 * M5 = MPerXdl
                         M3,
-                        M4)),
+                        M4,
+                        M5)),
                     make_freeze_transform(I0),
                     make_unmerge_transform(make_tuple(
-                        Number<CShuffleNXdlPerWavePerShuffle>{}, // N0 (NXdlPerWave) per shuffle
-                        N1,                                      // N1 = NWave
-                        N2))),                                   // N2 = NPerXdl
+                        Number<CShuffleNXdlPerWavePerShuffle / NXdlPack>{}, // N0 (NXdlPerWave)
+                                                                            // per shuffle
+                        N1,                                                 // N1 = NWave
+                        N2,                                                 // N2 = NXdlPack
+                        N3))),                                              // N3 = NPerXdl
                 make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-                make_tuple(
-                    Sequence<>{}, Sequence<0, 2, 4, 5, 6>{}, Sequence<>{}, Sequence<1, 3, 7>{}));
+                make_tuple(Sequence<>{},
+                           Sequence<0, 2, 4, 6, 7, 8>{},
+                           Sequence<>{},
+                           Sequence<1, 3, 5, 9>{}));
 
             // calculate origin of thread output tensor on global memory
             //     blockwise GEMM c matrix starting index
@@ -1762,8 +1801,8 @@ struct GridwiseMoeGemmMX_BPreshuffle
 
             const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor =
                 make_single_stage_tensor_adaptor(
-                    make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4))),
-                    make_tuple(Sequence<0, 1, 2, 3, 4>{}),
+                    make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4, M5))),
+                    make_tuple(Sequence<0, 1, 2, 3, 4, 5>{}),
                     make_tuple(Sequence<0>{}));
 
             const auto m_thread_data_on_block_idx =
@@ -1772,8 +1811,8 @@ struct GridwiseMoeGemmMX_BPreshuffle
 
             const auto n_thread_data_on_block_to_n0_n1_n2_adaptor =
                 make_single_stage_tensor_adaptor(
-                    make_tuple(make_merge_transform(make_tuple(N0, N1, N2))),
-                    make_tuple(Sequence<0, 1, 2>{}),
+                    make_tuple(make_merge_transform(make_tuple(N0, N1, N2, N3))),
+                    make_tuple(Sequence<0, 1, 2, 3>{}),
                     make_tuple(Sequence<0>{}));
 
             const auto n_thread_data_on_block_idx =
@@ -1781,36 +1820,39 @@ struct GridwiseMoeGemmMX_BPreshuffle
                     make_multi_index(n_thread_data_on_block));
 
             // shuffle: threadwise copy C from VGPR to LDS
-            auto c_thread_copy_vgpr_to_lds =
-                ThreadwiseTensorSliceTransfer_v1r3<AccDataType,
-                                                   CShuffleDataType,
-                                                   decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
-                                                   decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2),
-                                                   ck::tensor_operation::element_wise::PassThrough,
-                                                   Sequence<CShuffleMXdlPerWavePerShuffle,
-                                                            CShuffleNXdlPerWavePerShuffle,
-                                                            I1,
-                                                            I1,
-                                                            M2,
-                                                            I1,
-                                                            M4,
-                                                            I1>,
-                                                   Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
-                                                   7,
-                                                   1,
-                                                   InMemoryDataOperationEnum::Set,
-                                                   1,
-                                                   true>{
-                    c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
-                    make_multi_index(0,
-                                     0,
-                                     m_thread_data_on_block_idx[I1],
-                                     n_thread_data_on_block_idx[I1],
-                                     m_thread_data_on_block_idx[I2],
-                                     m_thread_data_on_block_idx[I3],
-                                     m_thread_data_on_block_idx[I4],
-                                     n_thread_data_on_block_idx[I2]),
-                    ck::tensor_operation::element_wise::PassThrough{}};
+            auto c_thread_copy_vgpr_to_lds = ThreadwiseTensorSliceTransfer_v1r3<
+                AccDataType,
+                CShuffleDataType,
+                decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
+                decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2),
+                ck::tensor_operation::element_wise::PassThrough,
+                Sequence<CShuffleMXdlPerWavePerShuffle / MXdlPack,
+                         CShuffleNXdlPerWavePerShuffle / NXdlPack,
+                         I1,
+                         I1,
+                         M2,
+                         N2,
+                         M3,
+                         I1,
+                         M5,
+                         I1>,
+                Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9>,
+                9,
+                1,
+                InMemoryDataOperationEnum::Set,
+                1,
+                true>{c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
+                      make_multi_index(0,
+                                       0,
+                                       m_thread_data_on_block_idx[I1],
+                                       n_thread_data_on_block_idx[I1],
+                                       m_thread_data_on_block_idx[I2],
+                                       n_thread_data_on_block_idx[I2],
+                                       m_thread_data_on_block_idx[I3],
+                                       m_thread_data_on_block_idx[I4],
+                                       m_thread_data_on_block_idx[I5],
+                                       n_thread_data_on_block_idx[I3]),
+                      ck::tensor_operation::element_wise::PassThrough{}};
 
             using EDataType = CDataType;
 
@@ -1859,7 +1901,7 @@ struct GridwiseMoeGemmMX_BPreshuffle
             using CDEBlockTransferCluster =
                 CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
             const auto EGlobalMemoryDataOperation = CGlobalMemoryDataOperation;
-            constexpr index_t scatter_weight_idx  = 1; // hack fix felix
+            constexpr index_t scatter_weight_idx  = 3; // hack fix felix
             auto cde_block_copy_lds_and_global    = ThreadGroupTensorSliceTransfer_v7r3_scatter<
                    ThisThreadBlock,
                    decltype(container_concat(make_tuple(CShuffleDataType{}), DsDataType{})),
@@ -1867,8 +1909,9 @@ struct GridwiseMoeGemmMX_BPreshuffle
                    decltype(c_ds_desc_refs),
                    decltype(tie(e_grid_desc_mblock_mperblock_nblock_nperblock)),
                    CElementwiseOperation,
-                   Sequence<static_cast<index_t>(EGlobalMemoryDataOperation)>, // FIXME: make Sequence
-                                                                               // support arbitray type
+                   Sequence<static_cast<index_t>(EGlobalMemoryDataOperation)>, // FIXME: make
+                                                                               // Sequence support
+                                                                               // arbitrary type
                    Sequence<1,
                             CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
                             1,
@@ -1898,13 +1941,25 @@ struct GridwiseMoeGemmMX_BPreshuffle
 
             auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
                 p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
+
             constexpr auto sfc_c_vgpr =
-                SpaceFillingCurve<Sequence<MXdlPerWave, NXdlPerWave, 1, 1, M2, 1, M4, 1>,
-                                  Sequence<0, 1, 2, 3, 4, 5, 6, 7>,
-                                  Sequence<CShuffleMXdlPerWavePerShuffle,
-                                           CShuffleNXdlPerWavePerShuffle,
+                SpaceFillingCurve<Sequence<MXdlPerWave / MXdlPack,
+                                           NXdlPerWave / NXdlPack,
                                            1,
                                            1,
+                                           MXdlPack,
+                                           NXdlPack,
+                                           M2,
+                                           1,
+                                           M4,
+                                           1>,
+                                  Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9>,
+                                  Sequence<CShuffleMXdlPerWavePerShuffle / MXdlPack,
+                                           CShuffleNXdlPerWavePerShuffle / NXdlPack,
+                                           1,
+                                           1,
+                                           MXdlPack,
+                                           NXdlPack,
                                            M2,
                                            1,
                                            M4,
@@ -1984,7 +2039,6 @@ struct GridwiseMoeGemmMX_BPreshuffle
             });
         }
     }
-#endif
 
     template <bool HasMainKBlockLoop,
               InMemoryDataOperationEnum CGlobalMemoryDataOperation,
diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
index bca68764f9..55ede990af 100644
--- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
@@ -95,6 +95,7 @@ struct wmma_type<WmmaInstr::wmma_f32_16x16x16_f16,
     static constexpr index_t m_per_wmma      = 16;
     static constexpr index_t n_per_wmma      = 16;
     static constexpr index_t k_per_wmma      = 16;
+    static constexpr index_t k_per_blk       = 8;
     static constexpr index_t src_a_data_size = 2;
     static constexpr index_t src_b_data_size = 2;
     static constexpr index_t acc_data_size   = 4;
@@ -136,6 +137,7 @@ struct wmma_type<WmmaInstr::wmma_f32_16x16x16_bf16,
     static constexpr index_t m_per_wmma               = 16;
     static constexpr index_t n_per_wmma               = 16;
     static constexpr index_t k_per_wmma               = 16;
+    static constexpr index_t k_per_blk                = 8;
     static constexpr index_t src_a_data_size          = 2;
     static constexpr index_t src_b_data_size          = 2;
     static constexpr index_t acc_data_size            = 4;
@@ -173,6 +175,7 @@ struct wmma_type<WmmaInstr::wmma_f16_16x16x16_f16,
     static constexpr index_t m_per_wmma               = 16;
     static constexpr index_t n_per_wmma               = 16;
     static constexpr index_t k_per_wmma               = 16;
+    static constexpr index_t k_per_blk                = 8;
     static constexpr index_t src_a_data_size          = 2;
     static constexpr index_t src_b_data_size          = 2;
     static constexpr index_t acc_data_size            = 2;
@@ -209,6 +212,7 @@ struct wmma_type<WmmaInstr::wmma_bf16_16x16x16_bf16,
     static constexpr index_t m_per_wmma               = 16;
     static constexpr index_t n_per_wmma               = 16;
     static constexpr index_t k_per_wmma               = 16;
+    static constexpr index_t k_per_blk                = 8;
     static constexpr index_t src_a_data_size          = 2;
     static constexpr index_t src_b_data_size          = 2;
     static constexpr index_t acc_data_size            = 2;
@@ -251,6 +255,7 @@ struct wmma_type<WmmaInstr::wmma_i32_16x16x16_iu8,
     static constexpr index_t m_per_wmma               = 16;
     static constexpr index_t n_per_wmma               = 16;
     static constexpr index_t k_per_wmma               = 16;
+    static constexpr index_t k_per_blk                = 8;
     static constexpr index_t src_a_data_size          = 2;
     static constexpr index_t src_b_data_size          = 2;
     static constexpr index_t acc_data_size            = 4;
@@ -301,6 +306,7 @@ struct wmma_type<WmmaInstr::wmma_f32_16x16x16_f16_gfx12,
     static constexpr index_t m_per_wmma = 16;
     static constexpr index_t n_per_wmma = 16;
     static constexpr index_t k_per_wmma = 16;
+    static constexpr index_t k_per_blk  = 8;
     // static constexpr index_t src_a_data_size = 2;
     // static constexpr index_t src_b_data_size = 2;
     // static constexpr index_t acc_data_size   = 4;
@@ -339,6 +345,7 @@ struct wmma_type<WmmaInstr::wmma_f32_16x16x16_bf16_gfx12,
     static constexpr index_t m_per_wmma = 16;
     static constexpr index_t n_per_wmma = 16;
     static constexpr index_t k_per_wmma = 16;
+    static constexpr index_t k_per_blk  = 8;
     // static constexpr index_t src_a_data_size          = 2;
     // static constexpr index_t src_b_data_size          = 2;
     static constexpr index_t acc_data_size            = 4;
@@ -372,6 +379,7 @@ struct wmma_type<WmmaInstr::wmma_i32_16x16x16_iu8_gfx12,
     static constexpr index_t m_per_wmma = 16;
     static constexpr index_t n_per_wmma = 16;
     static constexpr index_t k_per_wmma = 16;
+    static constexpr index_t k_per_blk  = 8;
     // static constexpr index_t src_a_data_size          = 2;
     // static constexpr index_t src_b_data_size          = 2;
     static constexpr index_t acc_data_size            = 4;
@@ -413,6 +421,7 @@ struct wmma_type<WmmaInstr::wmma_f32_16x16x16_f8f8_gfx12,
     static constexpr index_t m_per_wmma               = 16;
     static constexpr index_t n_per_wmma               = 16;
     static constexpr index_t k_per_wmma               = 16;
+    static constexpr index_t k_per_blk                = 8;
     static constexpr index_t acc_data_size            = 4;
     static constexpr index_t acc_pack_number          = 1;
     static constexpr index_t num_thread_per_subgroups = n_per_wmma;
@@ -448,6 +457,7 @@ struct wmma_type<WmmaInstr::wmma_f32_16x16x16_f8bf8_gfx12,
     static constexpr index_t m_per_wmma               = 16;
     static constexpr index_t n_per_wmma               = 16;
     static constexpr index_t k_per_wmma               = 16;
+    static constexpr index_t k_per_blk                = 8;
     static constexpr index_t acc_data_size            = 4;
     static constexpr index_t acc_pack_number          = 1;
     static constexpr index_t num_thread_per_subgroups = n_per_wmma;
@@ -483,6 +493,7 @@ struct wmma_type<WmmaInstr::wmma_f32_16x16x16_bf8f8_gfx12,
     static constexpr index_t m_per_wmma               = 16;
     static constexpr index_t n_per_wmma               = 16;
     static constexpr index_t k_per_wmma               = 16;
+    static constexpr index_t k_per_blk                = 8;
     static constexpr index_t acc_data_size            = 4;
     static constexpr index_t acc_pack_number          = 1;
     static constexpr index_t num_thread_per_subgroups = n_per_wmma;
@@ -518,6 +529,7 @@ struct wmma_type<WmmaInstr::wmma_f32_16x16x16_bf8bf8_gfx12,
     static constexpr index_t m_per_wmma               = 16;
     static constexpr index_t n_per_wmma               = 16;
     static constexpr index_t k_per_wmma               = 16;
+    static constexpr index_t k_per_blk                = 8;
     static constexpr index_t acc_data_size            = 4;
     static constexpr index_t acc_pack_number          = 1;
     static constexpr index_t num_thread_per_subgroups = n_per_wmma;
@@ -768,6 +780,8 @@ struct WmmaGemm
 
     __device__ static constexpr index_t GetWaveSize() { return wmma_instr.wave_size; }
 
+    __device__ static constexpr index_t GetKPerWaveBlk() { return wmma_instr.k_per_blk; }
+
     template <class FloatA, class FloatB, class FloatC>
     __device__ void Run(const FloatA& p_a_wave, const FloatB& p_b_wave, FloatC& p_c_thread) const
     {
diff --git a/include/ck_tile/core/arch/arch.hpp b/include/ck_tile/core/arch/arch.hpp
index 8620e7337c..b66c00e392 100644
--- a/include/ck_tile/core/arch/arch.hpp
+++ b/include/ck_tile/core/arch/arch.hpp
@@ -136,66 +136,103 @@ CK_TILE_DEVICE void block_sync_load_raw(index_t cnt = 0)
 #endif
 }
 
-// https://llvm.org/docs/AMDGPU/gfx9_waitcnt.html
+struct WaitcntLayoutGfx12 // counter bit layout chosen when __gfx12__ is defined
+{ // s_wait_loadcnt_dscnt immediate: mem count in bits [13:8], ds count in bits [5:0]
+    CK_TILE_DEVICE static constexpr index_t VM_MASK   = 0x3F; // mem counter, 6 bits
+    CK_TILE_DEVICE static constexpr index_t LGKM_MASK = 0x3F; // ds counter, 6 bits
+    CK_TILE_DEVICE static constexpr bool HAS_EXP      = false; // no exp field; pack_exp() yields 0
+
+    CK_TILE_DEVICE static constexpr index_t pack_vm(index_t c) { return ((c & VM_MASK) << 8); }
+    CK_TILE_DEVICE static constexpr index_t pack_lgkm(index_t c) { return ((c & LGKM_MASK) << 0); }
+    CK_TILE_DEVICE static constexpr index_t pack_exp(index_t) { return 0; }
+};
+
+struct WaitcntLayoutGfx11 // counter bit layout chosen when __gfx11__ is defined
+{ // vm[15:10] (6 bits), lgkm[9:4] (6 bits); no exp field is packed, so bits [3:0] stay 0
+    CK_TILE_DEVICE static constexpr index_t VM_MASK   = 0x3F; // 6 bits
+    CK_TILE_DEVICE static constexpr index_t LGKM_MASK = 0x3F; // 6 bits
+    CK_TILE_DEVICE static constexpr bool HAS_EXP      = false; // pack_exp() yields 0
+
+    CK_TILE_DEVICE static constexpr index_t pack_vm(index_t c) { return ((c & VM_MASK) << 10); }
+    CK_TILE_DEVICE static constexpr index_t pack_lgkm(index_t c) { return ((c & LGKM_MASK) << 4); }
+    CK_TILE_DEVICE static constexpr index_t pack_exp(index_t) { return 0; }
+};
+
+struct WaitcntLayoutLegacy // fallback layout when neither __gfx12__ nor __gfx11__ is defined
+{ // bits FE'DC'BA98'7'654'3210 => VV'UU'LLLL'U'EEE'VVVV (V=vm, E=exp, L=lgkm, U=unused)
+    CK_TILE_DEVICE static constexpr index_t VM_MASK   = 0x3F; // 6 bits, split: [3:0] + [15:14]
+    CK_TILE_DEVICE static constexpr index_t LGKM_MASK = 0x0F; // 4 bits, placed at [11:8]
+    CK_TILE_DEVICE static constexpr index_t EXP_MASK  = 0x07; // 3 bits, placed at [6:4]
+    CK_TILE_DEVICE static constexpr bool HAS_EXP      = true; // pack_exp() contributes bits
+
+    CK_TILE_DEVICE static constexpr index_t pack_vm(index_t c)
+    {
+        c &= VM_MASK; // keep only the 6 counter bits
+        return ((c & 0xF) << 0) | ((c & 0x30) << 10); // low 4 -> [3:0], high 2 -> [15:14]
+    }
+    CK_TILE_DEVICE static constexpr index_t pack_lgkm(index_t c) { return ((c & LGKM_MASK) << 8); }
+    CK_TILE_DEVICE static constexpr index_t pack_exp(index_t c) { return ((c & EXP_MASK) << 4); }
+};
+
+// Select the active layout from the target-arch macros; anything else uses the legacy layout
+#if defined(__gfx12__)
+using Waitcnt = WaitcntLayoutGfx12;
+#elif defined(__gfx11__)
+using Waitcnt = WaitcntLayoutGfx11;
+#else // neither __gfx12__ nor __gfx11__
+using Waitcnt = WaitcntLayoutLegacy;
+#endif
+
+//----------------------------------------------
+// Public API: only from_* (constexpr templates)
+//----------------------------------------------
 struct waitcnt_arg
 {
-#if defined(__gfx12__)
-    // use s_wait_loadcnt_dscnt in this instruction; in this instruction, ds [5:0]; mem [13:8]
-    CK_TILE_DEVICE static constexpr index_t MAX = 0b00'111111'00'111111;
-
-    CK_TILE_DEVICE static constexpr index_t kMaxVmCnt   = 0b111111;
-    CK_TILE_DEVICE static constexpr index_t kMaxExpCnt  = 0b111;
-    CK_TILE_DEVICE static constexpr index_t kMaxLgkmCnt = 0b111111;
-
-    template <index_t cnt>
-    CK_TILE_DEVICE static constexpr index_t from_vmcnt()
-    {
-        static_assert(cnt >= 0 && !(cnt >> 6), "valid range is [0..63]");
-        return MAX & (cnt << 8);
-    }
-
-    template <index_t cnt>
-    CK_TILE_DEVICE static constexpr index_t from_expcnt()
-    {
-        return 0; // no export in MI series
-    }
-
-    template <index_t cnt>
-    CK_TILE_DEVICE static constexpr index_t from_lgkmcnt()
-    {
-        static_assert(cnt >= 0 && !(cnt >> 6), "valid range is [0..63]");
-        return MAX & cnt;
-    }
+    // kMax*: largest value each counter field can hold on the active arch (used as defaults below)
+#if defined(__gfx12__) || defined(__gfx11__)
+    CK_TILE_DEVICE static constexpr index_t kMaxVmCnt   = 0x3F; // 6 bits
+    CK_TILE_DEVICE static constexpr index_t kMaxLgkmCnt = 0x3F; // 6 bits
+    CK_TILE_DEVICE static constexpr index_t kMaxExpCnt  = 0x0;  // no exp field: only 0 is valid
 #else
-    // bit numbers (hex) -------------------------> FE'DC'BA98'7'654'3210
-    // [V]M [E]XP [L]GKM counters and [U]NUSED ---> VV'UU'LLLL'U'EEE'VVVV
-    CK_TILE_DEVICE static constexpr index_t MAX = 0b11'00'1111'0'111'1111;
-
-    CK_TILE_DEVICE static constexpr index_t kMaxVmCnt   = 0b111111;
-    CK_TILE_DEVICE static constexpr index_t kMaxExpCnt  = 0b111;
-    CK_TILE_DEVICE static constexpr index_t kMaxLgkmCnt = 0b1111;
+    CK_TILE_DEVICE static constexpr index_t kMaxVmCnt   = 0x3F; // 6 bits (split across two fields)
+    CK_TILE_DEVICE static constexpr index_t kMaxLgkmCnt = 0x0F; // 4 bits
+    CK_TILE_DEVICE static constexpr index_t kMaxExpCnt  = 0x07; // 3 bits
+#endif
 
     template <index_t cnt>
     CK_TILE_DEVICE static constexpr index_t from_vmcnt()
     {
-        static_assert(cnt >= 0 && !(cnt >> 6), "valid range is [0..63]");
-        return MAX & ((cnt & 0b1111) | ((cnt & 0b110000) << 10));
-    }
-
-    template <index_t cnt>
-    CK_TILE_DEVICE static constexpr index_t from_expcnt()
-    {
-        static_assert(cnt >= 0 && !(cnt >> 3), "valid range is [0..7]");
-        return MAX & (cnt << 4);
+        static_assert((cnt & ~Waitcnt::VM_MASK) == 0, "vmcnt out of range");
+        return Waitcnt::pack_vm(cnt); // the assert above also rejects negative cnt
     }
 
     template <index_t cnt>
     CK_TILE_DEVICE static constexpr index_t from_lgkmcnt()
     {
-        static_assert(cnt >= 0 && !(cnt >> 4), "valid range is [0..15]");
-        return MAX & (cnt << 8);
+        static_assert((cnt & ~Waitcnt::LGKM_MASK) == 0, "lgkmcnt out of range");
+        return Waitcnt::pack_lgkm(cnt); // the assert above also rejects negative cnt
     }
+
+    template <index_t cnt>
+    CK_TILE_DEVICE static constexpr index_t from_expcnt()
+    {
+        if constexpr(Waitcnt::HAS_EXP)
+        {
+            // EXP_MASK only exists on legacy, so its use is additionally hidden behind #if
+#if !defined(__gfx12__) && !defined(__gfx11__)
+            static_assert((cnt & ~Waitcnt::EXP_MASK) == 0, "expcnt out of range");
+            return Waitcnt::pack_exp(cnt);
+#else
+            (void)cnt; // not reached: HAS_EXP is false for the gfx11/gfx12 layouts
+            return 0;
 #endif
+        }
+        else
+        { // no exp field on this layout: only cnt == 0 is accepted and nothing is packed
+            static_assert(cnt == 0, "expcnt unsupported on this arch");
+            return 0;
+        }
+    }
 };
 
 template <index_t vmcnt   = waitcnt_arg::kMaxVmCnt,
@@ -262,12 +299,12 @@ CK_TILE_DEVICE void s_nop(index_t cnt = 0)
 #endif
 }
 
-#define CK_CONSTANT_ADDRESS_SPACE \
-    __attribute__((address_space( \
+#define CK_TILE_CONSTANT_ADDRESS_SPACE \
+    __attribute__((address_space(      \
         static_cast<safe_underlying_type_t<address_space_enum>>(address_space_enum::constant))))
 
 template <typename T>
-__device__ T* cast_pointer_to_generic_address_space(T CK_CONSTANT_ADDRESS_SPACE* p)
+__device__ T* cast_pointer_to_generic_address_space(T CK_TILE_CONSTANT_ADDRESS_SPACE* p)
 {
     // cast a pointer in "Constant" address space (4) to "Generic" address space (0)
     // only c-style pointer cast seems be able to be compiled
@@ -278,13 +315,13 @@ __device__ T* cast_pointer_to_generic_address_space(T CK_CONSTANT_ADDRESS_SPACE*
 }
 
 template <typename T>
-__host__ __device__ T CK_CONSTANT_ADDRESS_SPACE* cast_pointer_to_constant_address_space(T* p)
+__host__ __device__ T CK_TILE_CONSTANT_ADDRESS_SPACE* cast_pointer_to_constant_address_space(T* p)
 {
     // cast a pointer in "Generic" address space (0) to "Constant" address space (4)
     // only c-style pointer cast seems be able to be compiled;
 #pragma clang diagnostic push
 #pragma clang diagnostic ignored "-Wold-style-cast"
-    return (T CK_CONSTANT_ADDRESS_SPACE*)p; // NOLINT(old-style-cast)
+    return (T CK_TILE_CONSTANT_ADDRESS_SPACE*)p; // NOLINT(old-style-cast)
 #pragma clang diagnostic pop
 }
 
diff --git a/include/ck_tile/core/container/sequence.hpp b/include/ck_tile/core/container/sequence.hpp
index cfec2237f9..1a88a98cbf 100644
--- a/include/ck_tile/core/container/sequence.hpp
+++ b/include/ck_tile/core/container/sequence.hpp
@@ -214,6 +214,17 @@ CK_TILE_HOST_DEVICE static void print(const sequence<Is...>&)
     printf(">");
 }
 
+template <typename T> // type trait: is T a sequence<...> specialization?
+struct is_sequence : std::false_type // primary template: every other type
+{
+};
+template <index_t... Is>
+struct is_sequence<sequence<Is...>> : std::true_type // exact match only; no cv/ref stripping
+{
+};
+template <typename T>
+inline constexpr bool is_sequence_v = is_sequence<T>::value; // shorthand for is_sequence<T>::value
+
 namespace impl {
 template <typename T, T... Ints>
 struct __integer_sequence;
diff --git a/include/ck_tile/core/tensor/load_tile.hpp b/include/ck_tile/core/tensor/load_tile.hpp
index 2e9ab0f5c6..1be4259e97 100644
--- a/include/ck_tile/core/tensor/load_tile.hpp
+++ b/include/ck_tile/core/tensor/load_tile.hpp
@@ -17,6 +17,19 @@
 #include "ck_tile/core/tensor/null_tensor.hpp"
 
 namespace ck_tile {
+// Per-lane read-offset tweaks allow swizzling patterns not representable by tile_distribution.
+template <typename TileWindow_, // overload is enabled only when this is a class type
+          index_t i_access           = -1,
+          bool oob_conditional_check = true,
+          typename                   = std::enable_if_t<std::is_class_v<TileWindow_>>>
+CK_TILE_DEVICE auto load_tile_with_offset(const TileWindow_& tile_window,
+                                          index_t offset, // forwarded to load_with_offset()
+                                          number<i_access>                     = {},
+                                          bool_constant<oob_conditional_check> = {})
+{ // thin wrapper: returns tile_window.load_with_offset(offset, <i_access tag>, <oob tag>)
+    return tile_window.load_with_offset(
+        offset, number<i_access>{}, bool_constant<oob_conditional_check>{});
+}
 
 template <typename TileWindow_, index_t i_access = -1, bool oob_conditional_check = true>
 CK_TILE_DEVICE auto load_tile(const TileWindow_& tile_window,
@@ -49,6 +62,23 @@ CK_TILE_DEVICE auto load_tile_with_elementwise(const TileWindow_& tile_window,
         tile_window, elementwise, number<i_access>{}, bool_constant<oob_conditional_check>{});
 }
 
+// Per-lane read-offset tweaks allow swizzling patterns not representable by tile_distribution.
+template <typename DistributedTensor_, // must be a class type (cv-qualifiers ignored)
+          typename TileWindow_,        // must be a class type
+          index_t i_access           = -1,
+          bool oob_conditional_check = true,
+          typename = std::enable_if_t<std::is_class_v<std::remove_cv_t<DistributedTensor_>> &&
+                                      std::is_class_v<TileWindow_>>>
+CK_TILE_DEVICE auto load_tile_with_offset(DistributedTensor_& dst_tile,
+                                          const TileWindow_& tile_window,
+                                          index_t offset, // forwarded to load_with_offset()
+                                          number<i_access>                     = {},
+                                          bool_constant<oob_conditional_check> = {})
+{ // thin wrapper: returns tile_window.load_with_offset(offset, dst_tile, <tags>)
+    return tile_window.load_with_offset(
+        offset, dst_tile, number<i_access>{}, bool_constant<oob_conditional_check>{});
+}
+
 template <typename DistributedTensor_,
           typename TileWindow_,
           index_t i_access           = -1,
@@ -112,6 +142,23 @@ CK_TILE_DEVICE auto load_tile_raw(T& tile,
         tile, number<i_access>{}, bool_constant<oob_conditional_check>{}, bool_constant<pre_nop>{});
 }
 
+// Per-lane read-offset tweaks allow swizzling patterns not representable by tile_distribution.
+template <typename LdsTileWindow_, // must be a class type after remove_cvref_t
+          typename TileWindow_,    // must be a class type
+          index_t i_access           = -1,
+          bool oob_conditional_check = true,
+          typename = std::enable_if_t<std::is_class_v<remove_cvref_t<LdsTileWindow_>> &&
+                                      std::is_class_v<TileWindow_>>>
+CK_TILE_DEVICE auto async_load_tile_with_offset(LdsTileWindow_&& lds_tile,
+                                                const TileWindow_& tile_window,
+                                                index_t offset, // forwarded unchanged
+                                                number<i_access>                     = {},
+                                                bool_constant<oob_conditional_check> = {})
+{ // thin wrapper: returns tile_window.async_load_with_offset(offset, lds_tile, <tags>)
+    return tile_window.async_load_with_offset(
+        offset, lds_tile, number<i_access>{}, bool_constant<oob_conditional_check>{});
+}
+
 template <typename LdsTileWindow_,
           typename TileWindow_,
           index_t i_access           = -1,
@@ -121,8 +168,8 @@ CK_TILE_DEVICE auto async_load_tile(LdsTileWindow_&& lds_tile,
                                     number<i_access>                     = {},
                                     bool_constant<oob_conditional_check> = {})
 {
-    return tile_window.async_load(
-        lds_tile, number<i_access>{}, bool_constant<oob_conditional_check>{});
+    return async_load_tile_with_offset(
+        lds_tile, tile_window, 0, number<i_access>{}, bool_constant<oob_conditional_check>{});
 }
 
 template <typename LdsTileWindow_,
diff --git a/include/ck_tile/core/tensor/load_tile_transpose.hpp b/include/ck_tile/core/tensor/load_tile_transpose.hpp
index 1535250722..fb645f89e9 100644
--- a/include/ck_tile/core/tensor/load_tile_transpose.hpp
+++ b/include/ck_tile/core/tensor/load_tile_transpose.hpp
@@ -381,6 +381,8 @@ CK_TILE_HOST_DEVICE constexpr auto InputTileDistributionEncoding()
  * the last is SFINAE to ensure the tile distribution encoding is valid.
  *
  * @param tile_window             The tile window with static distribution to load and transpose.
+ * @param offset                  The offset (in elements) added to the base address before
+ * indexing.
  *
  * @return A statically distributed tensor containing the transposed tile data.
  *
@@ -399,18 +401,19 @@ template <
                                                                  typename BottomTensorView_::DataType,
                                                                  Policy>::distr_encoding_valid,
                                        Policy>>
-CK_TILE_DEVICE auto
-load_tile_transpose(const tile_window_with_static_distribution<BottomTensorView_,
-                                                               WindowLengths_,
-                                                               TileDistribution_,
-                                                               NumCoord>& tile_window)
+CK_TILE_DEVICE auto load_tile_transpose_with_offset(
+    const tile_window_with_static_distribution<BottomTensorView_,
+                                               WindowLengths_,
+                                               TileDistribution_,
+                                               NumCoord>& __restrict__ tile_window,
+    index_t offset)
 {
     using OutTileDstrEncode = typename OutputTileDistributionTraits<
         typename TileDistribution_::DstrEncode,
         typename BottomTensorView_::DataType>::TransposedDstrEncode;
     auto out_tensor = make_static_distributed_tensor<typename BottomTensorView_::DataType>(
         make_static_tile_distribution(OutTileDstrEncode{}));
-    auto trans_tensor           = tile_window.template load_transpose<Policy>();
+    auto trans_tensor           = tile_window.template load_transpose_with_offset<Policy>(offset);
     constexpr auto input_distr  = TileDistribution_{};
     constexpr auto output_distr = make_static_tile_distribution(OutTileDstrEncode{});
 
@@ -443,4 +446,49 @@ load_tile_transpose(const tile_window_with_static_distribution<BottomTensorView_
     return out_tensor;
 }
 
+/**
+ * @brief transpose loads tile from a tensor and returns the resulting tensor with a new
+ * (transposed) tile distribution. use SFINAE to ensure the tile distribution encoding is valid.
+ *
+ * This function is intended for use with statically distributed tensor tiles, where the input
+ * and output tile distributions differ due to the transpose operation. It ensures that the
+ * element space size and vector length remain consistent between the input and output
+ * distributions.
+ *
+ * @tparam BottomTensorView_      The type of the bottom tensor view.
+ * @tparam WindowLengths_         The type representing the window lengths.
+ * @tparam TileDistribution_      The type representing the tile distribution.
+ * @tparam NumCoord               The number of coordinates (dimensions).
+ * @tparam Policy                 The transpose policy to use (defaults to DefaultTranspose).
+ * the last is SFINAE to ensure the tile distribution encoding is valid.
+ *
+ * @param tile_window             The tile window with static distribution to load and transpose.
+ *                                Loaded via load_tile_transpose_with_offset() with offset 0.
+ *
+ * @return A statically distributed tensor containing the transposed tile data.
+ *
+ * @note
+ * - The function uses compile-time checks to ensure the input and output tile distributions
+ *   are compatible in terms of element space size and vector length.
+ * - The transpose operation is performed according to the specified Policy.
+ */
+template <
+    typename BottomTensorView_,
+    typename WindowLengths_,
+    typename TileDistribution_,
+    index_t NumCoord,
+    typename Policy = DefaultTranspose<typename BottomTensorView_::DataType>,
+    typename        = std::enable_if_t<TransposeTileDistrChecker<TileDistribution_,
+                                                                 typename BottomTensorView_::DataType,
+                                                                 Policy>::distr_encoding_valid,
+                                       Policy>>
+CK_TILE_DEVICE auto
+load_tile_transpose(const tile_window_with_static_distribution<BottomTensorView_,
+                                                               WindowLengths_,
+                                                               TileDistribution_,
+                                                               NumCoord>& __restrict__ tile_window)
+{
+    return load_tile_transpose_with_offset(tile_window, 0); // zero-offset convenience wrapper
+}
+
 } // namespace ck_tile
diff --git a/include/ck_tile/core/tensor/static_distributed_tensor.hpp b/include/ck_tile/core/tensor/static_distributed_tensor.hpp
index b73a27c8d5..5228ad978a 100644
--- a/include/ck_tile/core/tensor/static_distributed_tensor.hpp
+++ b/include/ck_tile/core/tensor/static_distributed_tensor.hpp
@@ -155,11 +155,11 @@ CK_TILE_HOST_DEVICE constexpr auto make_static_distributed_tensor(const StaticTi
 
 // get X indices from tuple of tile_distributed_index<>
 template <typename StaticTileDistribution, typename DistributedIndices>
-CK_TILE_HOST_DEVICE constexpr auto
-get_x_indices_from_distributed_indices(StaticTileDistribution tile_distribution,
-                                       DistributedIndices distributed_indices)
+CK_TILE_HOST_DEVICE constexpr auto get_x_indices_from_distributed_indices(
+    StaticTileDistribution tile_distribution,
+    DistributedIndices distributed_indices,
+    decltype(get_partition_index(tile_distribution)) partition_index)
 {
-    const auto partition_index = detail::get_partition_index(tile_distribution);
     constexpr auto y_indices =
         tile_distribution.get_y_indices_from_distributed_indices(distributed_indices);
 
@@ -170,6 +170,16 @@ get_x_indices_from_distributed_indices(StaticTileDistribution tile_distribution,
     return x_coord.get_bottom_index();
 }
 
+// get X indices from tuple of tile_distributed_index<>; the partition index is looked up here
+template <typename StaticTileDistribution, typename DistributedIndices>
+CK_TILE_HOST_DEVICE constexpr auto
+get_x_indices_from_distributed_indices(StaticTileDistribution tile_distribution,
+                                       DistributedIndices distributed_indices)
+{ // delegates to the three-argument overload with get_partition_index(tile_distribution)
+    return get_x_indices_from_distributed_indices(
+        tile_distribution, distributed_indices, get_partition_index(tile_distribution));
+}
+
 template <typename DataType, typename StaticTileDistribution, typename XIndicesPredicate>
 CK_TILE_HOST_DEVICE void
 set_tile_if(static_distributed_tensor<DataType, StaticTileDistribution>& out_tensor,
@@ -192,6 +202,29 @@ set_tile_if(static_distributed_tensor<DataType, StaticTileDistribution>& out_ten
     });
 }
 
+template <typename DataType, typename StaticTileDistribution, typename XIndicesPredicate>
+CK_TILE_HOST_DEVICE void
+set_tile_if(static_distributed_tensor<DataType, StaticTileDistribution>& out_tensor,
+            DataType value,              // written wherever predicate(x_indices) is true
+            XIndicesPredicate predicate, // called with the X indices of each swept element
+            decltype(get_partition_index(std::declval<StaticTileDistribution>())) partition_index)
+{ // set_tile_if variant that takes the partition index from the caller
+    constexpr auto out_spans =
+        static_distributed_tensor<DataType, StaticTileDistribution>::get_distributed_spans();
+    sweep_tile_span(out_spans[number<0>{}], [&](auto idx0) { // outer sweep: span 0
+        sweep_tile_span(out_spans[number<1>{}], [&](auto idx1) { // inner sweep: span 1
+            constexpr auto distributed_indices = make_tuple(idx0, idx1);
+            const auto x_indices               = get_x_indices_from_distributed_indices(
+                StaticTileDistribution{}, distributed_indices, partition_index);
+
+            if(predicate(x_indices))
+            {
+                out_tensor(distributed_indices) = value; // other elements are left untouched
+            }
+        });
+    });
+}
+
 // this function used inside span loop over
 template <typename YLengths, index_t XUnpacks>
 CK_TILE_HOST_DEVICE constexpr auto get_y_unpacks_from_x_unpacks(YLengths, number<XUnpacks>)
diff --git a/include/ck_tile/core/tensor/store_tile.hpp b/include/ck_tile/core/tensor/store_tile.hpp
index d5a716664d..b535b40534 100644
--- a/include/ck_tile/core/tensor/store_tile.hpp
+++ b/include/ck_tile/core/tensor/store_tile.hpp
@@ -9,6 +9,7 @@
 #include "ck_tile/core/algorithm/coordinate_transform.hpp"
 #include "ck_tile/core/container/container_helper.hpp"
 #include "ck_tile/core/numeric/math.hpp"
+#include "ck_tile/core/tensor/tile_distribution.hpp"
 #include "ck_tile/core/tensor/tile_window.hpp"
 #include "ck_tile/core/tensor/tile_window_linear.hpp"
 #include "ck_tile/core/utility/type_traits.hpp"
@@ -38,6 +39,31 @@ store_tile(tile_window_with_static_lengths<BottomTensorView_, WindowLengths_>& t
     tile_window.store(dstr_tensor);
 }
 
+template <typename BottomTensorView_,
+          typename WindowLengths_,
+          typename TileDistribution_,
+          typename DataType_>
+CK_TILE_DEVICE void
+store_tile(tile_window_with_static_lengths<BottomTensorView_, WindowLengths_>& tile_window_tmp,
+           const static_distributed_tensor<DataType_, TileDistribution_>& dstr_tensor,
+           decltype(get_partition_index(dstr_tensor.get_tile_distribution())) partition_index)
+{ // store_tile variant that takes the partition index from the caller
+    using DataType = remove_cvref_t<typename BottomTensorView_::DataType>; // view element type
+    using TileDstr = remove_cvref_t<TileDistribution_>;
+
+    static_assert(std::is_same_v<remove_cvref_t<DataType_>, DataType>, "wrong!");
+
+    constexpr auto tile_dstr = TileDstr{};
+
+    auto tile_window = make_tile_window(tile_window_tmp.get_bottom_tensor_view(),
+                                        tile_window_tmp.get_window_lengths(),
+                                        tile_window_tmp.get_window_origin(), // same view/lengths/origin
+                                        tile_dstr,        // distribution from dstr_tensor's type
+                                        partition_index); // caller-supplied
+
+    tile_window.store(dstr_tensor); // store through the rebuilt window
+}
+
 template <typename BottomTensorView_,
           typename WindowLengths_,
           typename TileDistribution_,
@@ -61,6 +87,31 @@ store_tile_raw(tile_window_with_static_lengths<BottomTensorView_, WindowLengths_
     tile_window.store_raw(dstr_tensor);
 }
 
+// Write `dstr_tensor` with store_raw() through a window built for the given partition index.
+template <typename BottomTensorView_,
+          typename WindowLengths_,
+          typename TileDistribution_,
+          typename DataType_>
+CK_TILE_DEVICE void
+store_tile_raw(tile_window_with_static_lengths<BottomTensorView_, WindowLengths_>& tile_window_tmp,
+               const static_distributed_tensor<DataType_, TileDistribution_>& dstr_tensor,
+               decltype(get_partition_index(dstr_tensor.get_tile_distribution())) partition_index)
+{
+    // The tensor's element type must match the element type of the destination view.
+    using ViewDataType = remove_cvref_t<typename BottomTensorView_::DataType>;
+    static_assert(std::is_same_v<remove_cvref_t<DataType_>, ViewDataType>, "wrong!");
+
+    // Bind the tensor's distribution and the explicit partition index to the same view,
+    // lengths and origin as the incoming window.
+    auto dstr_window = make_tile_window(tile_window_tmp.get_bottom_tensor_view(),
+                                        tile_window_tmp.get_window_lengths(),
+                                        tile_window_tmp.get_window_origin(),
+                                        remove_cvref_t<TileDistribution_>{},
+                                        partition_index);
+
+    dstr_window.store_raw(dstr_tensor);
+}
+
 template <typename BottomTensorView_,
           typename WindowLengths_,
           typename TileDistribution_,
diff --git a/include/ck_tile/core/tensor/tensor_view.hpp b/include/ck_tile/core/tensor/tensor_view.hpp
index fb209ba827..7dd2684347 100644
--- a/include/ck_tile/core/tensor/tensor_view.hpp
+++ b/include/ck_tile/core/tensor/tensor_view.hpp
@@ -444,6 +444,21 @@ struct null_tensor_view
 {
 };
 
+template <typename T>
+struct is_tensor_view : std::false_type // primary template: any other T is not a tensor view
+{
+};
+template <typename BufferView, typename TensorDesc, memory_operation_enum DstInMemOp>
+struct is_tensor_view<tensor_view<BufferView, TensorDesc, DstInMemOp>> : std::true_type
+{
+};
+template <>
+struct is_tensor_view<null_tensor_view> : std::true_type // null_tensor_view is accepted too
+{
+};
+template <typename T>
+inline constexpr bool is_tensor_view_v = is_tensor_view<T>::value; // ::value shorthand
+
 template <address_space_enum BufferAddressSpace = address_space_enum::generic,
           memory_operation_enum DstInMemOp      = memory_operation_enum::set,
           amd_buffer_coherence_enum Coherence   = amd_buffer_coherence_enum::coherence_default,
diff --git a/include/ck_tile/core/tensor/tile_distribution.hpp b/include/ck_tile/core/tensor/tile_distribution.hpp
index bc02ec74d2..52a5281cbe 100644
--- a/include/ck_tile/core/tensor/tile_distribution.hpp
+++ b/include/ck_tile/core/tensor/tile_distribution.hpp
@@ -17,13 +17,11 @@
 
 namespace ck_tile {
 
-namespace detail {
 template <typename Distribution>
 CK_TILE_HOST_DEVICE auto get_partition_index(Distribution)
 {
-    return Distribution::_get_partition_index();
+    return Distribution::get_partition_index();
 }
-} // namespace detail
 
 // distributed span
 template <index_t... PartialHsLengths>
@@ -91,7 +89,7 @@ struct tile_distribution
     CK_TILE_HOST_DEVICE static constexpr index_t get_num_of_dimension_p() { return NDimP; }
     CK_TILE_HOST_DEVICE static constexpr index_t get_num_of_dimension_r() { return NDimR; }
 
-    CK_TILE_HOST_DEVICE static auto _get_partition_index()
+    CK_TILE_HOST_DEVICE static auto get_partition_index()
     {
         // only support warp-tile and block-tile
         static_assert(NDimP == 1 or NDimP == 2, "wrong!");
@@ -172,9 +170,9 @@ struct tile_distribution
     }
 #endif
 
-    template <typename PartitionIndex = decltype(_get_partition_index())>
+    template <typename PartitionIndex = decltype(get_partition_index())>
     CK_TILE_HOST_DEVICE auto
-    calculate_index(const PartitionIndex& ps_idx = _get_partition_index()) const
+    calculate_index(const PartitionIndex& ps_idx = get_partition_index()) const
     {
         const auto ps_ys_idx = container_concat(ps_idx, array<index_t, NDimY>{0});
         const auto window_adaptor_thread_coord_tmp =
@@ -230,6 +228,23 @@ struct tile_distribution
     }
 };
 
+template <typename T>
+struct is_tile_distribution : std::false_type // primary template: T is not a tile_distribution
+{
+};
+template <typename PsYs2XsAdaptor,
+          typename Ys2DDescriptor,
+          typename StaticTileDistributionEncoding,
+          typename TileDistributionDetail>
+struct is_tile_distribution<tile_distribution<PsYs2XsAdaptor,
+                                              Ys2DDescriptor,
+                                              StaticTileDistributionEncoding,
+                                              TileDistributionDetail>> : std::true_type
+{ // matches every instantiation of tile_distribution (cv/ref-qualified types do not match)
+};
+template <typename T>
+inline constexpr bool is_tile_distribution_v = is_tile_distribution<T>::value; // ::value shorthand
+
 namespace detail {
 
 template <index_t NDimMax>
diff --git a/include/ck_tile/core/tensor/tile_scatter_gather.hpp b/include/ck_tile/core/tensor/tile_scatter_gather.hpp
index 4b04fd513d..e77ca805bb 100644
--- a/include/ck_tile/core/tensor/tile_scatter_gather.hpp
+++ b/include/ck_tile/core/tensor/tile_scatter_gather.hpp
@@ -189,8 +189,7 @@ struct tile_scatter_gather
         // need investigation
         const auto window_adaptor_thread_coord_tmp = make_tensor_adaptor_coordinate(
             tile_distribution.get_ps_ys_to_xs_adaptor(),
-            container_concat(detail::get_partition_index(tile_distribution),
-                             array<index_t, NDimY>{0}));
+            container_concat(get_partition_index(tile_distribution), array<index_t, NDimY>{0}));
 #endif
 
         BottomTensorIndex bottom_tensor_thread_origin_idx_tmp =
@@ -836,7 +835,7 @@ struct tile_scatter_gather
         // need investigation
         const auto window_adaptor_thread_coord_tmp = make_tensor_adaptor_coordinate(
             tile_dstr_.get_ps_ys_to_xs_adaptor(),
-            container_concat(detail::get_partition_index(tile_dstr_), array<index_t, NDimY>{0}));
+            container_concat(get_partition_index(tile_dstr_), array<index_t, NDimY>{0}));
 #endif
 
         BottomTensorIndex bottom_tensor_thread_origin_idx_tmp =
diff --git a/include/ck_tile/core/tensor/tile_window.hpp b/include/ck_tile/core/tensor/tile_window.hpp
index cfa2420f2f..1123ce7604 100644
--- a/include/ck_tile/core/tensor/tile_window.hpp
+++ b/include/ck_tile/core/tensor/tile_window.hpp
@@ -12,6 +12,7 @@
 #include "ck_tile/core/container/container_helper.hpp"
 #include "ck_tile/core/tensor/static_distributed_tensor.hpp"
 #include "ck_tile/core/tensor/tensor_adaptor.hpp"
+#include "ck_tile/core/tensor/tensor_view.hpp"
 #include "ck_tile/core/tensor/tile_distribution.hpp"
 #include "ck_tile/core/tensor/tile_window_base.hpp"
 #include "ck_tile/core/utility/functional.hpp"
@@ -67,18 +68,54 @@ struct tile_window_with_static_distribution
         const typename Base::BottomTensorView& bottom_tensor_view,
         const typename Base::WindowLengths& window_lengths,
         const typename Base::BottomTensorIndex& window_origin,
-        const typename Base::TileDstr& tile_distribution)
+        const typename Base::TileDstr& tile_distribution,
+        decltype(get_partition_index(tile_distribution)) partition_index)
         : pre_computed_coords_{}
     {
 
-        this->window_origin_                       = window_origin;
-        this->window_lengths_                      = window_lengths;
-        this->bottom_tensor_view_                  = bottom_tensor_view;
-        this->tile_dstr_                           = tile_distribution;
+        this->window_origin_      = window_origin;
+        this->window_lengths_     = window_lengths;
+        this->bottom_tensor_view_ = bottom_tensor_view;
+        this->tile_dstr_          = tile_distribution;
+
+        pre_computed_coords_ =
+            prepare_coords(bottom_tensor_view, window_origin, tile_distribution, partition_index);
+        if constexpr(Base::BottomTensorView::buffer_view::get_address_space() ==
+                     address_space_enum::global)
+        {
+            // Zero the last P entry (the lane id, per the name); a literal [1] overruns if NDimP==1.
+            auto use_lane_id_0 = partition_index;
+            use_lane_id_0[Base::TileDstr::get_num_of_dimension_p() - 1] = 0;
+            pre_computed_warp_coords_ =
+                prepare_coords(bottom_tensor_view, window_origin, tile_distribution, use_lane_id_0);
+        }
+    }
+
+    CK_TILE_DEVICE constexpr tile_window_with_static_distribution(
+        const typename Base::BottomTensorView& bottom_tensor_view,
+        const typename Base::WindowLengths& window_lengths,
+        const typename Base::BottomTensorIndex& window_origin,
+        const typename Base::TileDstr& tile_distribution)
+        : tile_window_with_static_distribution(bottom_tensor_view,
+                                               window_lengths,
+                                               window_origin,
+                                               tile_distribution,
+                                               get_partition_index(tile_distribution))
+    { // delegating overload: the partition index is queried from the distribution itself
+    }
+
+    CK_TILE_DEVICE constexpr auto
+    prepare_coords(const typename Base::BottomTensorView& bottom_tensor_view,
+                   const typename Base::BottomTensorIndex& window_origin,
+                   const typename Base::TileDstr& tile_distribution,
+                   decltype(get_partition_index(tile_distribution)) partition_index) const
+    {
+        array<tuple<typename Base::WindowAdaptorCoord, typename Base::BottomTensorCoord>, NumCoord>
+            coords;
+
         const auto window_adaptor_thread_coord_tmp = make_tensor_adaptor_coordinate(
             tile_distribution.get_ps_ys_to_xs_adaptor(),
-            container_concat(detail::get_partition_index(tile_distribution),
-                             array<index_t, Base::NDimY>{0}));
+            container_concat(partition_index, multi_index<Base::NDimY>{0}));
 
         typename Base::BottomTensorIndex bottom_tensor_thread_origin_idx_tmp =
             window_origin + window_adaptor_thread_coord_tmp.get_bottom_index();
@@ -105,18 +142,31 @@ struct tile_window_with_static_distribution
             Base::move_window_adaptor_and_bottom_tensor_thread_coordinate(
                 window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
 
-            pre_computed_coords_(iCoord) =
-                make_tuple(window_adaptor_thread_coord, bottom_tensor_thread_coord);
+            coords(iCoord) = make_tuple(window_adaptor_thread_coord, bottom_tensor_thread_coord);
         });
+
+        return coords;
     }
 
     template <index_t i_access_unsupport_ = -1, bool oob_conditional_check = true>
     CK_TILE_DEVICE auto load(number<i_access_unsupport_>          = {},
                              bool_constant<oob_conditional_check> = {}) const
+    {
+        return load_with_offset(
+            0, number<i_access_unsupport_>{}, bool_constant<oob_conditional_check>{});
+    }
+
+    template <index_t i_access_unsupport_ = -1, bool oob_conditional_check = true>
+    CK_TILE_DEVICE auto load_with_offset(index_t offset,
+                                         number<i_access_unsupport_>          = {},
+                                         bool_constant<oob_conditional_check> = {}) const
     {
         constexpr auto tile_dstr = typename Base::TileDstr{};
         auto dst_tensor = make_static_distributed_tensor<typename Base::DataType>(tile_dstr);
-        load(dst_tensor, number<i_access_unsupport_>{}, bool_constant<oob_conditional_check>{});
+        load_with_offset(offset,
+                         dst_tensor,
+                         number<i_access_unsupport_>{},
+                         bool_constant<oob_conditional_check>{});
         return dst_tensor;
     }
 
@@ -236,6 +286,19 @@ struct tile_window_with_static_distribution
     CK_TILE_DEVICE auto load(DistributedTensor& dst_tensor,
                              number<i_access_unsupport_>          = {},
                              bool_constant<oob_conditional_check> = {}) const
+    {
+        load_with_offset(
+            0, dst_tensor, number<i_access_unsupport_>{}, bool_constant<oob_conditional_check>{});
+    }
+
+    template <typename DistributedTensor,
+              index_t i_access_unsupport_ = -1,
+              bool oob_conditional_check  = true,
+              typename = std::enable_if_t<std::is_class_v<std::remove_cv_t<DistributedTensor>>>>
+    CK_TILE_DEVICE auto load_with_offset(index_t offset,
+                                         DistributedTensor& dst_tensor,
+                                         number<i_access_unsupport_>          = {},
+                                         bool_constant<oob_conditional_check> = {}) const
     {
         using Traits   = typename Base::Traits;
         using vector_t = typename Traits::vector_t;
@@ -258,7 +321,7 @@ struct tile_window_with_static_distribution
                 // read from bottom tensor
                 const vector_t vec_value =
                     this->get_bottom_tensor_view().template get_vectorized_elements<vector_t>(
-                        bottom_tensor_thread_coord, 0, bool_constant<oob_conditional_check>{});
+                        bottom_tensor_thread_coord, offset, bool_constant<oob_conditional_check>{});
                 // write into distributed tensor
                 static_for<0, Traits::ScalarPerVector, Traits::PackedSize>{}([&](auto j) {
                     constexpr auto idx_ys = generate_tuple(
@@ -450,10 +513,12 @@ struct tile_window_with_static_distribution
 
     template <typename LdsTileWindow_,
               index_t i_access_unsupport_ = -1,
-              bool oob_conditional_check  = true>
-    CK_TILE_DEVICE auto async_load(LdsTileWindow_&& lds_tile,
-                                   number<i_access_unsupport_>          = {},
-                                   bool_constant<oob_conditional_check> = {}) const
+              bool oob_conditional_check  = true,
+              typename = std::enable_if_t<std::is_class_v<remove_cvref_t<LdsTileWindow_>>>>
+    CK_TILE_DEVICE auto async_load_with_offset(index_t offset,
+                                               LdsTileWindow_&& lds_tile,
+                                               number<i_access_unsupport_>          = {},
+                                               bool_constant<oob_conditional_check> = {}) const
     {
         using LdsTileWindow = remove_cvref_t<LdsTileWindow_>;
         using LdsDataType   = typename LdsTileWindow::DataType;
@@ -472,12 +537,15 @@ struct tile_window_with_static_distribution
             auto window_adaptor_thread_coord = pre_computed_coords_[iCoord][I0];
             auto bottom_tensor_thread_coord  = pre_computed_coords_[iCoord][I1];
 
+            auto window_adaptor_warp_coord = pre_computed_warp_coords_[iCoord][I0];
+            auto bottom_tensor_warp_coord  = pre_computed_warp_coords_[iCoord][I1];
+
             static_for<0, NumAccessPerCoord, 1>{}([&](auto iCoordAccess) {
                 constexpr auto iAccess = number<iCoord * NumAccessPerCoord + iCoordAccess>{};
 
                 // Use precomputed window origin
                 auto lds_bottom_tensor_thread_idx =
-                    window_origin + window_adaptor_thread_coord.get_bottom_index();
+                    window_origin + window_adaptor_warp_coord.get_bottom_index();
 
                 // Use precomputed tensor descriptor
                 const auto lds_coord =
@@ -490,7 +558,7 @@ struct tile_window_with_static_distribution
                 this->get_bottom_tensor_view().template async_get_vectorized_elements<vector_t>(
                     smem,
                     bottom_tensor_thread_coord,
-                    number<0>{},
+                    offset,
                     bool_constant<oob_conditional_check>{});
 
                 // Move thread coordinate if not last access
@@ -503,18 +571,33 @@ struct tile_window_with_static_distribution
 
                     Base::move_window_adaptor_and_bottom_tensor_thread_coordinate(
                         window_adaptor_thread_coord, bottom_tensor_thread_coord, idx_diff_ps_ys);
+
+                    Base::move_window_adaptor_and_bottom_tensor_thread_coordinate(
+                        window_adaptor_warp_coord, bottom_tensor_warp_coord, idx_diff_ps_ys);
                 }
             });
         });
     }
 
     template <typename Policy, index_t i_access_unsupport_ = -1, bool oob_conditional_check = true>
-    CK_TILE_DEVICE auto load_transpose() const
+    CK_TILE_DEVICE auto load_transpose(number<i_access_unsupport_>          = {},
+                                       bool_constant<oob_conditional_check> = {}) const
+    {
+        return this->template load_transpose_with_offset<Policy>(
+            0, number<i_access_unsupport_>{}, bool_constant<oob_conditional_check>{});
+    }
+
+    template <typename Policy, index_t i_access_unsupport_ = -1, bool oob_conditional_check = true>
+    CK_TILE_DEVICE auto load_transpose_with_offset(index_t offset,
+                                                   number<i_access_unsupport_>          = {},
+                                                   bool_constant<oob_conditional_check> = {}) const
     {
         constexpr auto tile_dstr = typename Base::TileDstr{};
         auto dst_tensor = make_static_distributed_tensor<typename Base::DataType>(tile_dstr);
-        this->template load_transpose<Policy>(
-            dst_tensor, number<i_access_unsupport_>{}, bool_constant<oob_conditional_check>{});
+        this->template load_transpose_with_offset<Policy>(offset,
+                                                          dst_tensor,
+                                                          number<i_access_unsupport_>{},
+                                                          bool_constant<oob_conditional_check>{});
         return dst_tensor;
     }
 
@@ -522,9 +605,10 @@ struct tile_window_with_static_distribution
               typename DistributedTensor,
               index_t i_access_unsupport_ = -1,
               bool oob_conditional_check  = true>
-    CK_TILE_DEVICE auto load_transpose(DistributedTensor& dst_tensor,
-                                       number<i_access_unsupport_>          = {},
-                                       bool_constant<oob_conditional_check> = {}) const
+    CK_TILE_DEVICE auto load_transpose_with_offset(index_t offset,
+                                                   DistributedTensor& dst_tensor,
+                                                   number<i_access_unsupport_>          = {},
+                                                   bool_constant<oob_conditional_check> = {}) const
     {
         using Traits   = typename Base::Traits;
         using vector_t = typename Traits::vector_t;
@@ -550,7 +634,7 @@ struct tile_window_with_static_distribution
                 const vector_t vec_value =
                     this->get_bottom_tensor_view()
                         .template get_transpose_vectorized_elements<vector_t>(
-                            bottom_tensor_thread_coord, 0);
+                            bottom_tensor_thread_coord, offset);
                 // write into distributed tensor
                 static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) {
                     constexpr auto orig_idx_ys = generate_tuple(
@@ -862,16 +946,26 @@ struct tile_window_with_static_distribution
                                    pre_computed_coords_(iCoord)(I1),
                                    step);
         });
+
+        if constexpr(Base::BottomTensorView::buffer_view::get_address_space() ==
+                     address_space_enum::global)
+        {
+            static_for<0, NumCoord, 1>{}([&](auto iCoord) {
+                move_tensor_coordinate(this->bottom_tensor_view_.get_tensor_descriptor(),
+                                       pre_computed_warp_coords_(iCoord)(I1),
+                                       step);
+            });
+        }
     }
 
     CK_TILE_DEVICE void set_window_origin_extended(const typename Base::BottomTensorIndex&)
     {
         // TODO: this use less register for FA, but more register for GEMM
         // need investigation
-        const auto window_adaptor_thread_coord_tmp = make_tensor_adaptor_coordinate(
-            this->tile_dstr_.get_ps_ys_to_xs_adaptor(),
-            container_concat(detail::get_partition_index(this->tile_dstr_),
-                             array<index_t, Base::NDimY>{0}));
+        const auto window_adaptor_thread_coord_tmp =
+            make_tensor_adaptor_coordinate(this->tile_dstr_.get_ps_ys_to_xs_adaptor(),
+                                           container_concat(get_partition_index(this->tile_dstr_),
+                                                            array<index_t, Base::NDimY>{0}));
 
         typename Base::BottomTensorIndex bottom_tensor_thread_origin_idx_tmp =
             this->window_origin_ + window_adaptor_thread_coord_tmp.get_bottom_index();
@@ -908,6 +1002,12 @@ struct tile_window_with_static_distribution
     //   per-thread coordinate for bottom tensor
     array<tuple<typename Base::WindowAdaptorCoord, typename Base::BottomTensorCoord>, NumCoord>
         pre_computed_coords_;
+    // Coordinates built from a lane-0 partition index; populated only for global-memory windows.
+    std::conditional_t<
+        Base::BottomTensorView::buffer_view::get_address_space() == address_space_enum::global,
+        array<tuple<typename Base::WindowAdaptorCoord, typename Base::BottomTensorCoord>, NumCoord>,
+        std::byte> // one-byte placeholder type for every other address space
+        pre_computed_warp_coords_;
 };
 
 // TODO: use strategy
@@ -929,6 +1029,27 @@ make_tile_window(const TensorView_& tensor_view,
         tensor_view, window_lengths, origin, tile_distribution};
 }
 
+template <typename TensorView_,
+          typename WindowLengths_,
+          typename StaticTileDistribution_,
+          index_t NumCoord = 1,
+          typename         = std::enable_if_t<is_tensor_view_v<TensorView_> &&
+                                              is_tile_distribution_v<StaticTileDistribution_>>>
+CK_TILE_DEVICE constexpr auto
+make_tile_window(const TensorView_& tensor_view,
+                 const WindowLengths_& window_lengths,
+                 const multi_index<TensorView_::get_num_of_dimension()>& origin,
+                 const StaticTileDistribution_& tile_distribution,
+                 decltype(get_partition_index(tile_distribution)) partition_index,
+                 number<NumCoord> = {})
+{ // builds a window whose coordinates are derived from the caller-supplied partition_index
+    using Window = tile_window_with_static_distribution<remove_cvref_t<TensorView_>,
+                                                        remove_cvref_t<WindowLengths_>,
+                                                        remove_cvref_t<StaticTileDistribution_>,
+                                                        NumCoord>;
+    return Window{tensor_view, window_lengths, origin, tile_distribution, partition_index};
+}
+
 // this version can't be called in a constexpr context
 template <typename TensorView_,
           typename WindowLengths_,
@@ -1131,15 +1252,25 @@ make_tile_window(const tile_window_with_static_lengths<TensorView, WindowLengths
                             tile_distribution);
 }
 
+template <typename TensorView, typename WindowLengths, typename StaticTileDistribution>
+CK_TILE_DEVICE constexpr auto
+make_tile_window(const tile_window_with_static_lengths<TensorView, WindowLengths>& tile_window,
+                 const StaticTileDistribution& tile_distribution,
+                 decltype(get_partition_index(tile_distribution)) partition_index)
+{ // re-targets an existing static-length window: same view/lengths/origin, given distribution
+    return make_tile_window(tile_window.get_bottom_tensor_view(),
+                            tile_window.get_window_lengths(),
+                            tile_window.get_window_origin(),
+                            tile_distribution,
+                            partition_index);
+}
+
 template <typename TensorView, typename WindowLengths, typename StaticTileDistribution>
 CK_TILE_DEVICE constexpr auto
 make_tile_window_raw(const tile_window_with_static_lengths<TensorView, WindowLengths>& tile_window,
                      const StaticTileDistribution& tile_distribution)
 {
-    auto w = make_tile_window(tile_window.get_bottom_tensor_view(),
-                              tile_window.get_window_lengths(),
-                              tile_window.get_window_origin(),
-                              tile_distribution);
+    auto w = make_tile_window(tile_window, tile_distribution);
     w.init_raw();
     return w;
 }
diff --git a/include/ck_tile/core/utility/static_counter.hpp b/include/ck_tile/core/utility/static_counter.hpp
index 84af3dd52f..4828e2e010 100644
--- a/include/ck_tile/core/utility/static_counter.hpp
+++ b/include/ck_tile/core/utility/static_counter.hpp
@@ -102,11 +102,14 @@ struct static_counter_uniq_;
 }
 
 #define MAKE_SC() \
-    ck_tile::static_counter<ck_tile::impl::static_counter_uniq_<__COUNTER__>> {}
-#define MAKE_SC_WITH(start_, step_) \
-    ck_tile::static_counter<ck_tile::impl::static_counter_uniq_<__COUNTER__>, start_, step_> {}
-#define NEXT_SC(c_) c_.next<__COUNTER__>()
-#define NEXT_SCI(c_, static_i_) c_.next<__COUNTER__ + static_i_>()
+    __extension__ ck_tile::static_counter<ck_tile::impl::static_counter_uniq_<__COUNTER__>> {}
+#define MAKE_SC_WITH(start_, step_)                                                     \
+    __extension__ ck_tile::                                                             \
+        static_counter<ck_tile::impl::static_counter_uniq_<__COUNTER__>, start_, step_> \
+    {                                                                                   \
+    }
+#define NEXT_SC(c_) __extension__ c_.next<__COUNTER__>()
+#define NEXT_SCI(c_, static_i_) __extension__ c_.next<__COUNTER__ + static_i_>()
 
 // Usage:
 // constexpr auto c = MAKE_SC()
diff --git a/include/ck_tile/host/kernel_launch.hpp b/include/ck_tile/host/kernel_launch.hpp
index 9ac0b5ba0e..be38e92b1a 100644
--- a/include/ck_tile/host/kernel_launch.hpp
+++ b/include/ck_tile/host/kernel_launch.hpp
@@ -110,6 +110,10 @@ CK_TILE_HOST double timing_loop_impl(TimerType timer,
 {
     for(int i = 0; i < s.cold_niters_; i++)
     {
+        if constexpr(!std::is_same_v<PreprocessFunc, std::nullptr_t>)
+        {
+            preprocess();
+        }
         callables_func();
     }
     // Only profile preprocess if it's provided
diff --git a/include/ck_tile/host/tensor_shuffle_utils.hpp b/include/ck_tile/host/tensor_shuffle_utils.hpp
index f29f3eeed6..e3b5c96d91 100644
--- a/include/ck_tile/host/tensor_shuffle_utils.hpp
+++ b/include/ck_tile/host/tensor_shuffle_utils.hpp
@@ -24,16 +24,43 @@ template <typename GemmConfig, typename T>
 auto shuffle_b(const ck_tile::HostTensor<T>& t)
 {
     assert(t.get_lengths().size() == 2);
-    int n_                = t.get_lengths()[1];
-    int k_                = t.get_lengths()[0];
-    constexpr int divisor = GemmConfig::N_Warp_Tile == 32 ? 2 : 4;
-    ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Warp_Tile,
-                                   GemmConfig::N_Warp_Tile,
-                                   k_ / GemmConfig::K_Warp_Tile,
-                                   divisor,
-                                   GemmConfig::K_Warp_Tile / divisor});
-    std::copy(t.begin(), t.end(), t_view.begin());
-    return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
+    int n_ = t.get_lengths()[1];
+    int k_ = t.get_lengths()[0];
+
+    if(ck_tile::is_gfx12_supported())
+    {
+        constexpr int divisor      = 2;
+        constexpr int kABK1PerLane = 8;
+        constexpr int kABK0PerLane = GemmConfig::K_Warp_Tile / divisor / kABK1PerLane;
+        ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Warp_Tile,
+                                       GemmConfig::N_Warp_Tile,
+                                       k_ / GemmConfig::K_Warp_Tile,
+                                       kABK0PerLane,
+                                       divisor,
+                                       kABK1PerLane});
+        std::copy(t.begin(), t.end(), t_view.begin());
+        return ck_tile::reference_permute(t_view, {0, 2, 4, 1, 3, 5});
+    }
+    else
+    {
+        int divisor = 1;
+        if(ck_tile::is_gfx11_supported())
+        {
+            divisor = 1;
+        }
+        else
+        {
+            assert(is_wave32() == false);
+            divisor = GemmConfig::N_Warp_Tile == 32 ? 2 : 4;
+        }
+        ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Warp_Tile,
+                                       GemmConfig::N_Warp_Tile,
+                                       k_ / GemmConfig::K_Warp_Tile,
+                                       divisor,
+                                       GemmConfig::K_Warp_Tile / divisor});
+        std::copy(t.begin(), t.end(), t_view.begin());
+        return ck_tile::reference_permute(t_view, {0, 2, 3, 1, 4});
+    }
 }
 
 template <typename GemmConfig, typename T>
@@ -55,21 +82,46 @@ template <typename GemmConfig, typename T>
 auto shuffle_b_permuteN(const ck_tile::HostTensor<T>& t)
 {
     assert(t.get_lengths().size() == 2);
-
     int n_                = t.get_lengths()[1];
     int k_                = t.get_lengths()[0];
-    constexpr int divisor = GemmConfig::N_Warp_Tile == 32 ? 2 : 4;
     constexpr int NRepeat = GemmConfig::N_Tile / GemmConfig::N_Warp_Tile / GemmConfig::N_Warp;
-
-    ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Tile,
-                                   GemmConfig::N_Warp,
-                                   GemmConfig::N_Warp_Tile,
-                                   NRepeat,
-                                   k_ / GemmConfig::K_Warp_Tile,
-                                   divisor,
-                                   GemmConfig::K_Warp_Tile / divisor});
-
-    std::copy(t.begin(), t.end(), t_view.begin());
-    return ck_tile::reference_permute(t_view, {0, 3, 1, 4, 5, 2, 6});
+    if(ck_tile::is_gfx12_supported())
+    {
+        constexpr int divisor      = 2;
+        constexpr int kABK1PerLane = 8;
+        constexpr int kABK0PerLane = GemmConfig::K_Warp_Tile / divisor / kABK1PerLane;
+        ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Tile,
+                                       GemmConfig::N_Warp,
+                                       GemmConfig::N_Warp_Tile,
+                                       NRepeat,
+                                       k_ / GemmConfig::K_Warp_Tile,
+                                       kABK0PerLane,
+                                       divisor,
+                                       kABK1PerLane});
+        std::copy(t.begin(), t.end(), t_view.begin());
+        return ck_tile::reference_permute(t_view, {0, 3, 1, 4, 6, 5, 2, 7});
+    }
+    else
+    {
+        int divisor = 1;
+        if(ck_tile::is_gfx11_supported())
+        {
+            divisor = 1;
+        }
+        else
+        {
+            assert(is_wave32() == false);
+            divisor = GemmConfig::N_Warp_Tile == 32 ? 2 : 4;
+        }
+        ck_tile::HostTensor<T> t_view({n_ / GemmConfig::N_Tile,
+                                       GemmConfig::N_Warp,
+                                       GemmConfig::N_Warp_Tile,
+                                       NRepeat,
+                                       k_ / GemmConfig::K_Warp_Tile,
+                                       divisor,
+                                       GemmConfig::K_Warp_Tile / divisor});
+        std::copy(t.begin(), t.end(), t_view.begin());
+        return ck_tile::reference_permute(t_view, {0, 3, 1, 4, 5, 2, 6});
+    }
 }
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp b/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp
index 2843966cd7..8cf47c46e7 100644
--- a/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp
+++ b/include/ck_tile/ops/epilogue/default_2d_epilogue.hpp
@@ -93,13 +93,27 @@ struct Default2DEpilogue
                                    const DsDramWindows& ds_dram_windows,
                                    void* = nullptr) const
     {
+        constexpr bool is_partition_index =
+            std::is_convertible_v<decltype(ds_dram_windows),
+                                  decltype(get_partition_index(
+                                      o_acc_tile.get_tile_distribution()))>;
+
         const auto storeOrUpdateTile = [&](const auto& o_tile) {
             // TODO: this is ugly
             if constexpr(UseRawStore && (kPadM || kPadN))
             {
                 if constexpr(MemoryOperation == memory_operation_enum::set)
                 {
-                    store_tile_raw(o_dram_window_tmp, cast_tile<ODataType>(o_tile));
+                    if constexpr(is_partition_index)
+                    {
+                        store_tile_raw(o_dram_window_tmp,
+                                       cast_tile<ODataType>(o_tile),
+                                       /*partition_index=*/ds_dram_windows);
+                    }
+                    else
+                    {
+                        store_tile_raw(o_dram_window_tmp, cast_tile<ODataType>(o_tile));
+                    }
                 }
                 else
                 {
@@ -111,16 +125,35 @@ struct Default2DEpilogue
             {
                 if constexpr(MemoryOperation == memory_operation_enum::set)
                 {
-                    store_tile(o_dram_window_tmp, cast_tile<ODataType>(o_tile));
+                    if constexpr(is_partition_index)
+                    {
+                        store_tile(o_dram_window_tmp,
+                                   cast_tile<ODataType>(o_tile),
+                                   /*partition_index=*/ds_dram_windows);
+                    }
+                    else
+                    {
+                        store_tile(o_dram_window_tmp, cast_tile<ODataType>(o_tile));
+                    }
                 }
                 else
                 {
-                    update_tile(o_dram_window_tmp, cast_tile<ODataType>(o_tile));
+                    if constexpr(is_partition_index)
+                    {
+                        update_tile(o_dram_window_tmp,
+                                    cast_tile<ODataType>(o_tile),
+                                    /*partition_index=*/ds_dram_windows);
+                    }
+                    else
+                    {
+                        update_tile(o_dram_window_tmp, cast_tile<ODataType>(o_tile));
+                    }
                 }
             }
         };
 
-        if constexpr(!std::is_same_v<DsDramWindows, std::nullptr_t> && Problem::NumDTensor >= 1)
+        if constexpr(!std::is_same_v<DsDramWindows, std::nullptr_t> && !is_partition_index &&
+                     Problem::NumDTensor >= 1)
         {
             using elementwise_result_t = decltype(load_tile(
                 make_tile_window(ds_dram_windows[number<0>{}].get_bottom_tensor_view(),
diff --git a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
index 551dc6f50d..a72b1ba544 100644
--- a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
@@ -190,7 +190,7 @@ struct GroupedGemmKernel
      */
     CK_TILE_HOST static auto MaxOccupancyGridSize(const stream_config& s) -> dim3
     {
-        using ConstantPointer = const void CK_CONSTANT_ADDRESS_SPACE*;
+        using ConstantPointer = const void CK_TILE_CONSTANT_ADDRESS_SPACE*;
         const auto kernel     = kentry<1, Kernel, ConstantPointer, index_t>;
         int occupancy;
         HIP_CHECK_ERROR(
@@ -518,7 +518,7 @@ struct GroupedGemmKernel
 
     // For non-persistent kernels
     template <bool U = UsePersistentKernel, typename = std::enable_if_t<!U>>
-    CK_TILE_DEVICE void operator()(const void CK_CONSTANT_ADDRESS_SPACE* gemm_descs_const,
+    CK_TILE_DEVICE void operator()(const void CK_TILE_CONSTANT_ADDRESS_SPACE* gemm_descs_const,
                                    index_t group_count) const
     {
         const index_t block_id   = ck_tile::get_block_1d_id();
@@ -541,7 +541,7 @@ struct GroupedGemmKernel
     template <bool U   = UsePersistentKernel,
               typename = std::enable_if_t<U>,
               typename = void> // extra template parameter to avoid redefinition
-    CK_TILE_DEVICE void operator()(const void CK_CONSTANT_ADDRESS_SPACE* gemm_descs_const,
+    CK_TILE_DEVICE void operator()(const void CK_TILE_CONSTANT_ADDRESS_SPACE* gemm_descs_const,
                                    const index_t group_count) const
     {
         const index_t grid_size  = ck_tile::get_grid_size();
diff --git a/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp
index 915aebd1e6..d8850749f1 100644
--- a/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp
@@ -84,9 +84,10 @@ struct StreamKKernel
     using CLayout = typename GemmPipeline::CLayout;
 
     /// @brief  Specify the data type configurations for A, B, and C
-    using ADataType = typename GemmPipeline::ADataType;
-    using BDataType = typename GemmPipeline::BDataType;
-    using CDataType = typename EpiloguePipeline::ODataType;
+    using ADataType   = typename GemmPipeline::ADataType;
+    using BDataType   = typename GemmPipeline::BDataType;
+    using CDataType   = typename EpiloguePipeline::ODataType;
+    using AccDataType = typename EpiloguePipeline::AccDataType;
 
     template <typename T>
     static constexpr bool is_tuple_v = is_detected<is_tuple, T>::value;
@@ -243,14 +244,6 @@ struct StreamKKernel
 
     CK_TILE_HOST static bool IsSupportedArgument(const StreamKKernelArgs& kargs)
     {
-        if(kargs.reduction_strategy == StreamKReductionStrategy::Reduction)
-        {
-            if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
-            {
-                CK_TILE_ERROR("CK Tile Stream-K only supports the atomic reduction strategy.");
-            }
-            return false;
-        }
         return UniversalGemmKernel::IsSupportedArgument(kargs);
     }
 
@@ -258,7 +251,7 @@ struct StreamKKernel
     /// @return The buffer size needed.
     CK_TILE_HOST static uint32_t GetWorkSpaceSize(const StreamKKernelArgs& kargs)
     {
-        return kargs.tile_partitioner.GetWorkSpaceSize(sizeof(CDataType));
+        return kargs.tile_partitioner.get_workspace_size(sizeof(AccDataType));
     }
 
     /// @brief Sets the kargs' current workspace_ptr to the given workspace_ptr.
@@ -299,6 +292,118 @@ struct StreamKKernel
             {a_ptr}, {b_ptr}, {/*ds_ptr*/}, c_ptr, smem_ptr_0, kargs, num_loop, i_m, i_n, k_size);
     }
 
+    /// @brief Signals that the current thread block (CTA) has completed storing its partial
+    /// results.
+    /// @param kargs Kernel arguments, including the workspace pointer.
+    /// @param cta_idx The index of the current thread block (CTA).
+    /// @note This function utilizes a workgroup barrier to set a synchronization flag for the given
+    /// CTA index.
+    CK_TILE_DEVICE void SignalStorePartialDone(const StreamKKernelArgs& kargs,
+                                               index_t cta_idx) const
+    {
+        auto sk_flags_ptr = static_cast<uint32_t*>(kargs.workspace_ptr);
+        workgroup_barrier sk_flags(sk_flags_ptr);
+        sk_flags.wait_set(0, 1, cta_idx);
+    }
+
+    /// @brief Waits for the thread block (cta_idx) to complete storing its partial results.
+    /// @param kargs Kernel arguments, including the workspace pointer.
+    /// @param cta_idx The index of the thread block (CTA).
+    /// @note This function utilizes a workgroup barrier to wait for the synchronization flag to be
+    /// set by the given CTA index.
+    CK_TILE_DEVICE void WaitStorePartialDone(const StreamKKernelArgs& kargs, index_t cta_idx) const
+    {
+        auto sk_flags_ptr = static_cast<uint32_t*>(kargs.workspace_ptr);
+        workgroup_barrier sk_flags(sk_flags_ptr);
+        sk_flags.wait_eq(1, cta_idx);
+    }
+
+    /// @brief Adds the values of a block tile to an output block tile.
+    /// @param in_out_block_tile The output block tile to which values are added.
+    /// @param in_block_tile The input block tile whose values are added.
+    /// @note This function iterates over the distributed spans of the block tiles and updates the
+    /// output block tile with accumulated values.
+    template <typename OAccTile>
+    CK_TILE_DEVICE void AddBlockTile(OAccTile& in_out_block_tile,
+                                     const OAccTile& in_block_tile) const
+    {
+        using BlockType        = remove_cvref_t<decltype(in_out_block_tile)>;
+        constexpr auto o_spans = BlockType::get_distributed_spans();
+        sweep_tile_span(o_spans[number<0>{}], [&](auto idx0) {
+            sweep_tile_span(o_spans[number<1>{}], [&](auto idx1) {
+                constexpr auto idx     = make_tuple(idx0, idx1);
+                in_out_block_tile(idx) = in_out_block_tile[idx] + in_block_tile[idx];
+            });
+        });
+    }
+
+    /// @brief Loads a partial block tile from the workspace buffer.
+    /// @param kargs Kernel arguments, including the workspace pointer.
+    /// @param cta_idx The index of the thread block (CTA) whose stored partial tile is read.
+    /// @param c_block_tile_dist The tile distribution for the block.
+    /// @return The loaded partial block tile.
+    /// @note This function calculates the buffer pointer and uses the tile distribution for loading
+    /// the partial block tile.
+    template <typename DataType, typename OAccTileDist>
+    CK_TILE_DEVICE auto LoadPartial(const StreamKKernelArgs& kargs,
+                                    index_t cta_idx,
+                                    const OAccTileDist& c_block_tile_dist) const
+    {
+        const auto c_block_tile_buffer_size =
+            TilePartitioner::MPerBlock * TilePartitioner::NPerBlock * sizeof(DataType);
+        void* partial_buffer_ptr = static_cast<char*>(kargs.workspace_ptr) +
+                                   kargs.tile_partitioner.get_flags_buffer_size() +
+                                   cta_idx * c_block_tile_buffer_size;
+
+        const auto& partial_tensor_view = make_naive_tensor_view<address_space_enum::global>(
+            static_cast<DataType*>(partial_buffer_ptr),
+            make_tuple(number<TilePartitioner::MPerBlock>{}, number<TilePartitioner::NPerBlock>{}),
+            make_tuple(TilePartitioner::NPerBlock, 1),
+            number<GemmPipeline::GetVectorSizeC()>{},
+            number<1>{});
+
+        auto partial_tile_window = make_tile_window(
+            partial_tensor_view,
+            make_tuple(number<TilePartitioner::MPerBlock>{}, number<TilePartitioner::NPerBlock>{}),
+            {0, 0},
+            c_block_tile_dist);
+
+        return load_tile(partial_tile_window);
+    }
+
+    /// @brief Stores a partial block tile to the workspace buffer.
+    /// @param kargs Kernel arguments, including the workspace pointer.
+    /// @param cta_idx The index of the thread block (CTA) whose workspace slot is written.
+    /// @param c_block_tile The block tile to be stored.
+    /// @note This function calculates the buffer pointer and uses the tile window for storing the
+    /// partial block tile.
+    template <typename OAccTile>
+    CK_TILE_DEVICE void StorePartial(const StreamKKernelArgs& kargs,
+                                     index_t cta_idx,
+                                     const OAccTile& c_block_tile) const
+    {
+        const auto c_block_tile_buffer_size = TilePartitioner::MPerBlock *
+                                              TilePartitioner::NPerBlock *
+                                              sizeof(typename OAccTile::DataType);
+        void* partial_buffer_ptr = static_cast<char*>(kargs.workspace_ptr) +
+                                   kargs.tile_partitioner.get_flags_buffer_size() +
+                                   cta_idx * c_block_tile_buffer_size;
+
+        const auto& partial_tensor_view = make_naive_tensor_view<address_space_enum::global>(
+            static_cast<typename OAccTile::DataType*>(partial_buffer_ptr),
+            make_tuple(number<TilePartitioner::MPerBlock>{}, number<TilePartitioner::NPerBlock>{}),
+            make_tuple(TilePartitioner::NPerBlock, 1),
+            number<GemmPipeline::GetVectorSizeC()>{},
+            number<1>{});
+
+        auto partial_tile_window = make_tile_window(
+            partial_tensor_view,
+            make_tuple(number<TilePartitioner::MPerBlock>{}, number<TilePartitioner::NPerBlock>{}),
+            {0, 0});
+
+        store_tile(partial_tile_window, c_block_tile);
+    }
+
     /// @brief Runs the main Stream-K algorithm.
     /// @param kargs Stream-K kernel arguments.
     /// @param cta_idx The current Stream-K workgroup's index.
@@ -347,7 +452,88 @@ struct StreamKKernel
             }
             else
             {
-                // TODO: Apply reduction logic.
+                const auto c_macro_tile_idx =
+                    kargs.tile_partitioner.get_output_tile_index(tile_idx);
+                index_t i_m =
+                    c_macro_tile_idx[UniversalGemmKernel::I0] * TilePartitioner::MPerBlock;
+                index_t i_n =
+                    c_macro_tile_idx[UniversalGemmKernel::I1] * TilePartitioner::NPerBlock;
+
+                const ADataType* a_ptr = static_cast<const ADataType*>(kargs.as_ptr[0]) + i_k_a;
+                const BDataType* b_ptr = static_cast<const BDataType*>(kargs.bs_ptr[0]) + i_k_b;
+                CDataType* c_ptr       = static_cast<CDataType*>(kargs.e_ptr);
+
+                // Create Gemm tensor views, pad views and tile windows
+                const auto& gemm_tensor_views_tuple =
+                    UniversalGemmKernel::template MakeGemmTensorViews<
+                        EpiloguePipeline::MemoryOperation>(
+                        {a_ptr}, {b_ptr}, {/*ds_ptr*/}, c_ptr, kargs, k_size);
+
+                const auto& gemm_pad_views =
+                    UniversalGemmKernel::MakeGemmPadViews(gemm_tensor_views_tuple);
+                auto gemm_tile_windows =
+                    UniversalGemmKernel::MakeGemmTileWindows(gemm_pad_views, i_m, i_n);
+
+                // Run GEMM cooperatively by whole workgroup.
+                const auto& as_block_window = gemm_tile_windows.at(UniversalGemmKernel::I0);
+                const auto& bs_block_window = gemm_tile_windows.at(UniversalGemmKernel::I1);
+                const auto& ds_block_window = gemm_tile_windows.at(UniversalGemmKernel::I2);
+
+                // Since num_loop_sk can vary per WG and per iteration of the Stream-K while loop,
+                // we compute has_hot_loop and tail_num here. This mirrors the pattern used by
+                // grouped GEMM. In this case, we call the GemmPipeline's operator() overload
+                // that takes both has_hot_loop and tail_num.
+                const bool has_hot_loop   = GemmPipeline::BlockHasHotloop(num_loop_sk);
+                const TailNumber tail_num = GemmPipeline::GetBlockLoopTailNum(num_loop_sk);
+
+                const auto& c_block_tile = GemmPipeline{}(as_block_window[UniversalGemmKernel::I0],
+                                                          bs_block_window[UniversalGemmKernel::I0],
+                                                          num_loop_sk,
+                                                          has_hot_loop,
+                                                          tail_num,
+                                                          smem_ptr_0);
+
+                auto tile_started = iter_start == tile_iter_start;
+                auto tile_ended   = iter_end >= tile_iter_end;
+                if(!tile_started)
+                {
+                    StorePartial(kargs, cta_idx, c_block_tile);
+                    // Ensure device-wide visibility of partial results stored in global memory
+                    // before signaling completion. __threadfence() guarantees that all global
+                    // memory writes by this thread are visible to other threads on the device.
+                    __threadfence(); // send signal when the store is done
+                    SignalStorePartialDone(kargs, cta_idx);
+                }
+                else
+                {
+                    auto accum_block_tile = c_block_tile;
+                    if(!tile_ended)
+                    {
+                        const index_t iter_per_tile = kargs.tile_partitioner.get_iters_per_tile();
+                        const index_t iter_per_cta  = kargs.tile_partitioner.get_iters_per_sk_cta();
+                        const index_t extra_iters   = kargs.tile_partitioner.get_extra_iters();
+                        int accum_iters             = local_iter_end - local_iter_start;
+                        int next_cta                = cta_idx + 1;
+
+                        while(accum_iters < iter_per_tile)
+                        {
+                            WaitStorePartialDone(kargs, next_cta);
+
+                            using BlockType = remove_cvref_t<decltype(c_block_tile)>;
+                            AddBlockTile(
+                                accum_block_tile,
+                                LoadPartial<typename BlockType::DataType>(
+                                    kargs, next_cta, c_block_tile.get_tile_distribution()));
+
+                            accum_iters += iter_per_cta + (next_cta < extra_iters);
+                            ++next_cta;
+                        }
+                    }
+
+                    auto& c_block_window = gemm_tile_windows.at(UniversalGemmKernel::I3);
+                    EpiloguePipeline{}(
+                        c_block_window, accum_block_tile, ds_block_window, smem_ptr_0);
+                }
             }
 
             // Prepare for next Stream-K loop iteration.
diff --git a/include/ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner.hpp b/include/ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner.hpp
index e98c60e5f0..996ef5a7ef 100644
--- a/include/ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner.hpp
+++ b/include/ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner.hpp
@@ -31,21 +31,20 @@ struct StreamKTilePartitionerBase
 
     StreamKTilePartitionerBase(index_t m, index_t n, index_t k, index_t grid);
 
-    private:
     /**
      * @brief Calculates the total space needed for the partials buffer.
      *
      * @param acc_element_bytes  The number of bytes for the accumulator data type used in the GEMM.
      * @return index_t           The number of bytes needed for the partials buffer.
      */
-    CK_TILE_HOST index_t get_partials_buffer_size(index_t acc_element_bytes) const noexcept;
+    CK_TILE_HOST_DEVICE index_t get_partials_buffer_size(index_t acc_element_bytes) const noexcept;
 
     /**
      * @brief Calculates the total space needed for the flags buffer.
      *
      * @return index_t The number of bytes needed for the flags buffer.
      */
-    CK_TILE_HOST index_t get_flags_buffer_size() const noexcept;
+    CK_TILE_HOST_DEVICE index_t get_flags_buffer_size() const noexcept;
 
     public:
     /**
@@ -123,7 +122,7 @@ struct StreamKTilePartitionerBase
      * @param acc_element_bytes  The number of bytes for the accumulator data type used in the GEMM.
      * @return index_t           The number of bytes needed for the partials and flags buffers.
      */
-    CK_TILE_HOST index_t get_workspace_size(index_t acc_element_bytes) const noexcept;
+    CK_TILE_HOST_DEVICE index_t get_workspace_size(index_t acc_element_bytes) const noexcept;
 
     /**
      * @brief Returns the number of macro tiles in the C tensor.
diff --git a/include/ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner_impl.hpp b/include/ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner_impl.hpp
index 2eefe3420c..b3217624d1 100644
--- a/include/ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner_impl.hpp
+++ b/include/ck_tile/ops/gemm/kernel/streamk_gemm_tile_partitioner_impl.hpp
@@ -45,7 +45,7 @@ StreamKTilePartitionerBase<BlockGemmShapeType, ReductionStrategyType>::StreamKTi
 }
 
 template <typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategyType>
-CK_TILE_HOST index_t
+CK_TILE_HOST_DEVICE index_t
 StreamKTilePartitionerBase<BlockGemmShapeType, ReductionStrategyType>::get_partials_buffer_size(
     index_t acc_element_bytes) const noexcept
 {
@@ -53,7 +53,7 @@ StreamKTilePartitionerBase<BlockGemmShapeType, ReductionStrategyType>::get_parti
 }
 
 template <typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategyType>
-CK_TILE_HOST index_t
+CK_TILE_HOST_DEVICE index_t
 StreamKTilePartitionerBase<BlockGemmShapeType, ReductionStrategyType>::get_flags_buffer_size()
     const noexcept
 {
@@ -116,7 +116,7 @@ StreamKTilePartitionerBase<BlockGemmShapeType, ReductionStrategyType>::get_outpu
 }
 
 template <typename BlockGemmShapeType, StreamKReductionStrategy ReductionStrategyType>
-CK_TILE_HOST index_t
+CK_TILE_HOST_DEVICE index_t
 StreamKTilePartitionerBase<BlockGemmShapeType, ReductionStrategyType>::get_workspace_size(
     index_t acc_element_bytes) const noexcept
 {
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_async.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_async.hpp
index 91da3cd27b..b293097d89 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_async.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_async.hpp
@@ -164,6 +164,13 @@ struct GemmPipelineAgBgCrCompAsync : public BaseGemmPipelineAgBgCrCompAsync<Prob
     static constexpr auto is_a_load_tr_v = bool_constant<PipelineImplBase::is_a_load_tr>{};
     static constexpr auto is_b_load_tr_v = bool_constant<PipelineImplBase::is_b_load_tr>{};
 
+    [[nodiscard]] CK_TILE_HOST static const std::string GetPipelineName()
+    {
+        // clang-format off
+        return "COMPUTE_ASYNC";
+        // clang-format on
+    }
+
     CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
     {
         return Policy::template GetSmemSize<Problem>();
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp
index aaa04615fd..a1bbcbe990 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp
@@ -170,6 +170,13 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
     using Base::PrefetchStages;
     using Base::UsePersistentKernel;
 
+    [[nodiscard]] CK_TILE_HOST static const std::string GetPipelineName()
+    {
+        // clang-format off
+        return "COMPUTE_V3";
+        // clang-format on
+    }
+
     [[nodiscard]] CK_TILE_HOST static const std::string GetName()
     {
         // clang-format off
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp
index ff1e33bd5d..238b4e2389 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp
@@ -172,6 +172,13 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
     static constexpr auto is_a_load_tr_v = bool_constant<PipelineImplBase::is_a_load_tr>{};
     static constexpr auto is_b_load_tr_v = bool_constant<PipelineImplBase::is_b_load_tr>{};
 
+    [[nodiscard]] CK_TILE_HOST static const std::string GetPipelineName()
+    {
+        // clang-format off
+        return "COMPUTE_V4";
+        // clang-format on
+    }
+
     [[nodiscard]] CK_TILE_HOST static const std::string GetName()
     {
         // clang-format off
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp
index 7263ddd5a1..6343ff9872 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v5.hpp
@@ -99,6 +99,13 @@ struct GemmPipelineAgBgCrCompV5 : public BaseGemmPipelineAgBgCrCompV5<Problem>
     static constexpr index_t NumWarps  = BlockGemmShape::NumWarps;
     static constexpr index_t KTileSize = BlockGemmShape::WarpTile::at(I2{});
 
+    [[nodiscard]] CK_TILE_HOST static const std::string GetPipelineName()
+    {
+        // clang-format off
+        return "COMPUTE_V5";
+        // clang-format on
+    }
+
     [[nodiscard]] CK_TILE_HOST static const std::string GetName()
     {
         // clang-format off
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v6.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v6.hpp
index 2ae9001098..5b57560f6e 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v6.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v6.hpp
@@ -159,6 +159,13 @@ struct GemmPipelineAgBgCrCompV6 : public BaseGemmPipelineAgBgCrCompV6<Problem>
     static constexpr auto is_a_load_tr_v = bool_constant<BasePImpl::is_a_load_tr>{};
     static constexpr auto is_b_load_tr_v = bool_constant<BasePImpl::is_b_load_tr>{};
 
+    [[nodiscard]] CK_TILE_HOST static const std::string GetPipelineName()
+    {
+        // clang-format off
+        return "COMPUTE_V6";
+        // clang-format on
+    }
+
     [[nodiscard]] CK_TILE_HOST static const std::string GetName()
     {
         // clang-format off
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
index d363626efd..ba71e3b6cb 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
@@ -214,6 +214,13 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem<Problem>
     static constexpr auto is_a_load_tr_v = bool_constant<PipelineImplBase::is_a_load_tr>{};
     static constexpr auto is_b_load_tr_v = bool_constant<PipelineImplBase::is_b_load_tr>{};
 
+    [[nodiscard]] CK_TILE_HOST static const std::string GetPipelineName()
+    {
+        // clang-format off
+        return "MEMORY";
+        // clang-format on
+    }
+
     [[nodiscard]] CK_TILE_HOST static const std::string GetName()
     {
         // clang-format off
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
index eb363d59b8..8a4fb59b51 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp
@@ -70,6 +70,13 @@ struct GemmPipelineAGmemBGmemCRegV1
 
     static constexpr index_t kLdsAlignmentInBytes = 16;
 
+    [[nodiscard]] CK_TILE_HOST static const std::string GetPipelineName()
+    {
+        // clang-format off
+        return "BASIC_V1";
+        // clang-format on
+    }
+
     [[nodiscard]] CK_TILE_HOST static const std::string GetName()
     {
         // clang-format off
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp
index c309f8908a..32217e0024 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp
@@ -70,6 +70,13 @@ struct GemmPipelineAGmemBGmemCRegV2
     // For the basic gemm pipelien DoubleSmemBuffer set to be false naturally.
     static constexpr bool DoubleSmemBuffer = false;
 
+    [[nodiscard]] CK_TILE_HOST static const std::string GetPipelineName()
+    {
+        // clang-format off
+        return "BASIC_V2";
+        // clang-format on
+    }
+
     [[nodiscard]] CK_TILE_HOST static const std::string GetName()
     {
         // clang-format off
diff --git a/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v2.hpp b/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v2.hpp
index 87f6c753b4..cae2bd0e9f 100644
--- a/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v2.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v2.hpp
@@ -176,6 +176,13 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2
     static constexpr index_t dswrite_mIter  = (DsWritePreIssue - 1) % MIterPerWarp;
     static constexpr index_t dswrite_kIter  = (DsWritePreIssue - 1) / MIterPerWarp;
 
+    [[nodiscard]] CK_TILE_HOST static const std::string GetPipelineName()
+    {
+        // clang-format off
+        return "PRESHUFFLE_V2";
+        // clang-format on
+    }
+
     [[nodiscard]] CK_TILE_HOST static const std::string GetName()
     {
         // clang-format off
diff --git a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma.hpp b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma.hpp
index 90f6204ff3..dd2931f6b7 100644
--- a/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma.hpp
+++ b/include/ck_tile/ops/gemm/warp/warp_gemm_attribute_wmma.hpp
@@ -79,6 +79,7 @@ struct WarpGemmAttributeWmma
     static constexpr index_t kM          = Impl::kM;
     static constexpr index_t kN          = Impl::kN;
     static constexpr index_t kK          = Impl::kK;
+    static constexpr index_t kCMLane     = Impl::kCMLane;
     static constexpr index_t kKPerThread = Impl::kABK0PerLane * Impl::kABK1PerLane;
 
     CK_TILE_HOST_DEVICE static constexpr auto get_num_of_access() { return 1; }
diff --git a/include/ck_tile/ops/gemm_quant.hpp b/include/ck_tile/ops/gemm_quant.hpp
index 3273131875..3e16d937cb 100644
--- a/include/ck_tile/ops/gemm_quant.hpp
+++ b/include/ck_tile/ops/gemm_quant.hpp
@@ -3,6 +3,7 @@
 
 #pragma once
 
+#include "ck_tile/ops/gemm_quant/block/block_gemm_quant_common.hpp"
 #include "ck_tile/ops/gemm_quant/block/block_universal_gemm_ar_flatbr_bquant_cr.hpp"
 #include "ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp"
 #include "ck_tile/ops/gemm_quant/block/block_universal_gemm_as_bs_bquant_cr.hpp"
diff --git a/include/ck_tile/ops/gemm_quant/block/block_gemm_quant_common.hpp b/include/ck_tile/ops/gemm_quant/block/block_gemm_quant_common.hpp
new file mode 100644
index 0000000000..d695888b88
--- /dev/null
+++ b/include/ck_tile/ops/gemm_quant/block/block_gemm_quant_common.hpp
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+
+namespace ck_tile {
+
+// Common utilities for quantized GEMM block operations
+template <typename CDataType,
+          typename WarpGemmType,
+          index_t MIterPerWarp,
+          index_t MWarp,
+          index_t NIterPerWarp,
+          index_t NWarp>
+struct BlockGemmQuantCommon
+{
+    CK_TILE_DEVICE static constexpr auto MakeCBlockTile()
+    {
+        constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding<
+            sequence<>,
+            tuple<sequence<MIterPerWarp, MWarp>, sequence<NIterPerWarp, NWarp>>,
+            tuple<sequence<1, 2>>,
+            tuple<sequence<1, 1>>,
+            sequence<1, 2>,
+            sequence<0, 0>>{};
+
+        constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+            c_block_outer_dstr_encoding, typename WarpGemmType::CWarpDstrEncoding{});
+        constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
+        auto c_block_tensor         = make_static_distributed_tensor<CDataType>(c_block_dstr);
+
+        return c_block_tensor;
+    }
+};
+
+} // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_ar_flatbr_bquant_cr.hpp b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_ar_flatbr_bquant_cr.hpp
index df55081b69..6422c07e1d 100644
--- a/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_ar_flatbr_bquant_cr.hpp
+++ b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_ar_flatbr_bquant_cr.hpp
@@ -5,6 +5,7 @@
 
 #include "ck_tile/core.hpp"
 #include "ck_tile/ops/gemm/block/block_wp_asmem_bsmem_creg_v1_custom_policy.hpp"
+#include "ck_tile/ops/gemm_quant/block/block_gemm_quant_common.hpp"
 
 namespace ck_tile {
 
@@ -81,11 +82,11 @@ struct BlockGemmWeightPreshuffleBQuantARegBRegCReg
         float scale_reg_f = 0.f;
         if constexpr(std::is_same_v<BQDataType, ck_tile::fp8_t>)
         {
-            scale_reg_f = element_wise::amd_assembly_fp8_to_fp32(static_cast<uint32_t>(scale));
+            scale_reg_f = __builtin_amdgcn_cvt_f32_fp8(static_cast<uint32_t>(scale), 0);
         }
         else if constexpr(std::is_same_v<BQDataType, ck_tile::bf8_t>)
         {
-            scale_reg_f = element_wise::amd_assembly_bf8_to_fp32(static_cast<uint32_t>(scale));
+            scale_reg_f = __builtin_amdgcn_cvt_f32_bf8(static_cast<uint32_t>(scale), 0);
         }
         else if constexpr(std::is_same_v<BQDataType, float>)
         {
@@ -100,21 +101,8 @@ struct BlockGemmWeightPreshuffleBQuantARegBRegCReg
 
     CK_TILE_DEVICE static constexpr auto MakeCBlockTile()
     {
-        constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding<
-            sequence<>,
-            tuple<sequence<MIterPerWarp, MWarp>, sequence<NIterPerWarp, NWarp>>,
-            tuple<sequence<1, 2>>,
-            tuple<sequence<1, 1>>,
-            sequence<1, 2>,
-            sequence<0, 0>>{};
-
-        constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
-            c_block_outer_dstr_encoding, typename WG::CWarpDstrEncoding{});
-
-        constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
-
-        auto c_block_tensor = make_static_distributed_tensor<CDataType>(c_block_dstr);
-        return c_block_tensor;
+        return BlockGemmQuantCommon<CDataType, WG, MIterPerWarp, MWarp, NIterPerWarp, NWarp>::
+            MakeCBlockTile(); // tile construction lives in block_gemm_quant_common.hpp
     }
 
     // C += A * B
diff --git a/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp
index 8b95ec6ddf..bbdd3128bf 100644
--- a/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp
+++ b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp
@@ -9,6 +9,7 @@
 #include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
 #include "ck_tile/ops/elementwise.hpp"
+#include "ck_tile/ops/gemm_quant/block/block_gemm_quant_common.hpp"
 
 namespace ck_tile {
 
@@ -24,13 +25,11 @@ struct BlockGemmAQuantBase
         float scale_reg_f = 0.f;
         if constexpr(std::is_same_v<AQDataType, ck_tile::fp8_t>)
         {
-            scale_reg_f =
-                ck_tile::element_wise::amd_assembly_fp8_to_fp32(static_cast<uint32_t>(scale));
+            scale_reg_f = __builtin_amdgcn_cvt_f32_fp8(static_cast<uint32_t>(scale), 0);
         }
         else if constexpr(std::is_same_v<AQDataType, ck_tile::bf8_t>)
         {
-            scale_reg_f =
-                ck_tile::element_wise::amd_assembly_bf8_to_fp32(static_cast<uint32_t>(scale));
+            scale_reg_f = __builtin_amdgcn_cvt_f32_bf8(static_cast<uint32_t>(scale), 0);
         }
         else if constexpr(std::is_same_v<AQDataType, float>)
         {
@@ -348,7 +347,7 @@ struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmAQuantBase<Problem_>
                         // Thread 0 can read AQ_tile[0, 0] from itself, AQ_tile[1,
                         // 0] from thread 1, ..., and AQ_tile[3, 0] from thread 3.
 
-                        constexpr uint32_t kTileRowsOfCPerThread = 4;
+                        constexpr uint32_t kTileRowsOfCPerThread = (get_warp_size() == 64) ? 4 : 8;
                         decltype(threadIdx.x) pull_from_lane     = 0;
                         if constexpr(WarpGemm::kM == 16)
                         {
@@ -409,7 +408,8 @@ struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmAQuantBase<Problem_>
                         // desired row coefficient
                         auto& scale_reg = aq_block_tensor.get_thread_buffer()[src_reg_offset];
 
-                        constexpr uint32_t kTileRows               = 4;
+                        // Tile row count: 4 when the warp size is 64, otherwise 8.
+                        constexpr uint32_t kTileRows = (get_warp_size() == 64) ? 4 : 8;
                         constexpr uint32_t kTiledCMsPerWarp        = WarpGemm::kCMLane * kTileRows;
                         constexpr uint32_t reg_offset_for_row_data = c_row * WarpGemm::kCMLane;
                         // Multiply by 4 because output is stored in tiles of 4
@@ -543,20 +543,8 @@ struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmAQuantBase<Problem_>
     public:
     CK_TILE_DEVICE static constexpr auto MakeCBlockTile()
     {
-        constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding<
-            sequence<>,
-            tuple<sequence<MIterPerWarp, MWarp>, sequence<NIterPerWarp, NWarp>>,
-            tuple<sequence<1, 2>>,
-            tuple<sequence<1, 1>>,
-            sequence<1, 2>,
-            sequence<0, 0>>{};
-
-        constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
-            c_block_outer_dstr_encoding, typename WarpGemm::CWarpDstrEncoding{});
-        constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
-        auto c_block_tensor         = make_static_distributed_tensor<CDataType>(c_block_dstr);
-
-        return c_block_tensor;
+        return BlockGemmQuantCommon<CDataType, WarpGemm, MIterPerWarp, MWarp, NIterPerWarp, NWarp>::
+            MakeCBlockTile(); // tile construction lives in block_gemm_quant_common.hpp
     }
 
     template <typename ASmemBlockWindow, typename BSmemBlockWindow>
diff --git a/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_bs_bquant_cr.hpp b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_bs_bquant_cr.hpp
index 9db444b57f..28ae709bf0 100644
--- a/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_bs_bquant_cr.hpp
+++ b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_bs_bquant_cr.hpp
@@ -9,6 +9,7 @@
 #include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
 #include "ck_tile/ops/elementwise.hpp"
+#include "ck_tile/ops/gemm_quant/block/block_gemm_quant_common.hpp"
 
 namespace ck_tile {
 
@@ -24,13 +25,11 @@ struct BlockGemmBQuantBase
         float scale_reg_f = 0.f;
         if constexpr(std::is_same_v<BQDataType, ck_tile::fp8_t>)
         {
-            scale_reg_f =
-                ck_tile::element_wise::amd_assembly_fp8_to_fp32(static_cast<uint32_t>(scale));
+            scale_reg_f = __builtin_amdgcn_cvt_f32_fp8(static_cast<uint32_t>(scale), 0);
         }
         else if constexpr(std::is_same_v<BQDataType, ck_tile::bf8_t>)
         {
-            scale_reg_f =
-                ck_tile::element_wise::amd_assembly_bf8_to_fp32(static_cast<uint32_t>(scale));
+            scale_reg_f = __builtin_amdgcn_cvt_f32_bf8(static_cast<uint32_t>(scale), 0);
         }
         else if constexpr(std::is_same_v<BQDataType, float>)
         {
@@ -376,20 +375,8 @@ struct BQuantBlockUniversalGemmAsBsCr : public BlockGemmBQuantBase<Problem_>
     public:
     CK_TILE_DEVICE static constexpr auto MakeCBlockTile()
     {
-        constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding<
-            sequence<>,
-            tuple<sequence<MIterPerWarp, MWarp>, sequence<NIterPerWarp, NWarp>>,
-            tuple<sequence<1, 2>>,
-            tuple<sequence<1, 1>>,
-            sequence<1, 2>,
-            sequence<0, 0>>{};
-
-        constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
-            c_block_outer_dstr_encoding, typename WarpGemm::CWarpDstrEncoding{});
-        constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
-        auto c_block_tensor         = make_static_distributed_tensor<CDataType>(c_block_dstr);
-
-        return c_block_tensor;
+        return BlockGemmQuantCommon<CDataType, WarpGemm, MIterPerWarp, MWarp, NIterPerWarp, NWarp>::
+            MakeCBlockTile(); // tile construction lives in block_gemm_quant_common.hpp
     }
 
     template <typename ASmemBlockWindow, typename BSmemBlockWindow>
diff --git a/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp b/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp
index 36cbb87877..15d2727f3b 100644
--- a/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp
+++ b/include/ck_tile/ops/gemm_quant/kernel/gemm_quant_kernel.hpp
@@ -240,7 +240,10 @@ struct QuantGemmKernel
         return dim3(TilePartitioner::GridSize(M, N), 1, KBatch);
     }
 
-    CK_TILE_HOST static constexpr auto BlockSize() { return dim3(kBlockSize); }
+    CK_TILE_HOST static auto BlockSize()
+    {
+        return is_wave32() ? dim3(kBlockSize / 2) : dim3(kBlockSize); // halved on wave32
+    }
 
     CK_TILE_HOST static constexpr QuantGemmKernelArgs
     MakeKernelArgs(const QuantGemmHostArgs& hostArgs)
diff --git a/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp b/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp
index 75ac1ca6ab..32f1279e93 100644
--- a/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp
+++ b/include/ck_tile/ops/gemm_quant/kernel/grouped_gemm_quant_kernel.hpp
@@ -208,7 +208,7 @@ struct QuantGroupedGemmKernel
      */
     CK_TILE_HOST static auto MaxOccupancyGridSize(const stream_config& s) -> dim3
     {
-        using ConstantPointer  = const void CK_CONSTANT_ADDRESS_SPACE*;
+        using ConstantPointer  = const void CK_TILE_CONSTANT_ADDRESS_SPACE*;
         const auto kernel_func = kentry<1, Kernel, ConstantPointer, index_t>;
         int occupancy;
         HIP_CHECK_ERROR(
@@ -499,7 +499,7 @@ struct QuantGroupedGemmKernel
     template <bool U   = UsePersistentKernel,
               typename = std::enable_if_t<U>,
               typename = void> // extra template parameter to avoid redefinition
-    CK_TILE_DEVICE void operator()(const void CK_CONSTANT_ADDRESS_SPACE* gemm_descs_const,
+    CK_TILE_DEVICE void operator()(const void CK_TILE_CONSTANT_ADDRESS_SPACE* gemm_descs_const,
                                    const index_t group_count) const
     {
         const index_t grid_size  = ck_tile::get_grid_size();
diff --git a/include/ck_tile/ops/gemm_quant/pipeline/tile_gemm_quant_traits.hpp b/include/ck_tile/ops/gemm_quant/pipeline/tile_gemm_quant_traits.hpp
index c4429b76f9..3a5b86382d 100644
--- a/include/ck_tile/ops/gemm_quant/pipeline/tile_gemm_quant_traits.hpp
+++ b/include/ck_tile/ops/gemm_quant/pipeline/tile_gemm_quant_traits.hpp
@@ -41,7 +41,8 @@ template <bool kPadM_,
           typename BQLayout_        = BLayout_,
           bool TransposeC_          = false,
           bool DoubleSmemBuffer_    = false,
-          bool UsePersistentKernel_ = false>
+          bool UsePersistentKernel_ = false,
+          int VectorSize_           = 16>
 struct TileGemmQuantTraits
 {
     static constexpr bool kPadM = kPadM_;
@@ -50,7 +51,7 @@ struct TileGemmQuantTraits
 
     static constexpr QuantType kQuantType = QuantType_;
 
-    static constexpr int _VectorSize       = 16;
+    static constexpr int _VectorSize       = VectorSize_;
     static constexpr bool DoubleSmemBuffer = DoubleSmemBuffer_;
 
     using ALayout  = ALayout_;
diff --git a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp
index 6dd9eca9ff..6de331fe6d 100644
--- a/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp
+++ b/include/ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp
@@ -16,10 +16,14 @@
 #include "ck_tile/ops/grouped_convolution/utils/transform_conv_fwd_to_gemm.hpp"
 #include "ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp"
 
+#ifdef CK_EXPERIMENTAL_BUILDER
+#include "ck_tile/builder/reflect/instance_traits_tile_grouped_convolution_forward.hpp"
+#endif
+
 namespace ck_tile {
 
 /// @brief The Grouped Convolution kernel device arguments.
-template <typename GroupedConvTraitsType_>
+template <typename GroupedConvTraitsType_, typename CDElementwise_>
 struct GroupedConvFwdKernelArgs
 {
 
@@ -31,7 +35,7 @@ struct GroupedConvFwdKernelArgs
                                GroupedConvTraitsType_::VectorSizeC,
                                GroupedConvTraitsType_::NumGroupsToMerge,
                                true>; // Split N enabled
-    using CDElementwise                 = typename GroupedConvTraitsType_::CDElementwise;
+    using CDElementwise                 = CDElementwise_;
     static constexpr index_t NumDTensor = GroupedConvTraitsType_::NumDTensor;
 
     template <
@@ -434,14 +438,13 @@ struct GroupedConvFwdKernelArgs
 ///                                     multiplication implementation. It is responsible for storing
 ///                                     results calculated by @ref GemmPipeline_ "GemmPipeline" to
 ///                                     the output C tensor in global memory.
-template <bool EnableSplitImage_,
-          typename GroupedConvTraitsType_,
+template <typename GroupedConvTraitsType_,
           typename TilePartitioner_,
           typename GemmPipeline_,
           typename EpiloguePipeline_>
 struct GroupedConvolutionForwardKernel
 {
-    static constexpr bool EnableSplitImage = EnableSplitImage_;
+    static constexpr bool EnableSplitImage = GroupedConvTraitsType_::EnableSplitImage;
     static constexpr index_t NDimSpatial   = GroupedConvTraitsType_::NDimSpatial;
     static constexpr ConvolutionSpecialization ConvSpecialization =
         GroupedConvTraitsType_::ConvSpecialization;
@@ -470,7 +473,8 @@ struct GroupedConvolutionForwardKernel
 
     using CDElementwise = typename EpiloguePipeline::CDElementwise;
 
-    using GroupedConvFwdKernelArgsSpecialized = GroupedConvFwdKernelArgs<GroupedConvTraitsType_>;
+    using GroupedConvFwdKernelArgsSpecialized =
+        GroupedConvFwdKernelArgs<GroupedConvTraitsType_, CDElementwise>;
 
     static constexpr bool IsSplitKSupported = false;
 
@@ -568,6 +572,19 @@ struct GroupedConvolutionForwardKernel
         // clang-format on
     }
 
+#ifdef CK_EXPERIMENTAL_BUILDER
+    CK_TILE_HOST std::string GetInstanceString() const // needs instance_traits (see assert)
+    {
+        static_assert(ck_tile::reflect::HasInstanceTraits<GroupedConvolutionForwardKernel>,
+                      "Specialization of instance_traits not found. Please check that a "
+                      "specialization exists in file "
+                      "ck_tile/builder/reflect/"
+                      "instance_traits_tile_grouped_convolution_forward.hpp "
+                      "for the given template parameters.");
+        return ck_tile::reflect::instance_string<GroupedConvolutionForwardKernel>();
+    }
+#endif // CK_EXPERIMENTAL_BUILDER
+
     CK_TILE_HOST static auto GridSize(const GroupedConvFwdKernelArgsSpecialized& kargs)
     {
         return dim3(
diff --git a/include/ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp b/include/ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp
index 8695fecac6..8ea6cffa7d 100644
--- a/include/ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp
+++ b/include/ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp
@@ -63,7 +63,7 @@ template <index_t NDimSpatial_,
           index_t VectorSizeB_      = 1,
           index_t VectorSizeC_      = 1,
           index_t NumGroupsToMerge_ = 1,
-          typename CDElementwise_   = PassThrough>
+          bool EnableSplitImage_    = false>
 struct GroupedConvTraits
 {
     private:
@@ -74,6 +74,22 @@ struct GroupedConvTraits
     }
 
     public:
+    // Implicit-GEMM settings fixed to literal values (independent of the template arguments)
+    struct FixedGemmParams
+    {
+        static constexpr ck_tile::index_t TilePartitionerGroupNum = 8;
+        static constexpr ck_tile::index_t TilePartitionerM01      = 4;
+        static constexpr bool kPadM                               = true;
+        static constexpr bool kPadN                               = true;
+        static constexpr bool kPadK                               = true;
+        static constexpr bool TransposeC                          = false;
+        static constexpr bool FixedVectorSize                     = true;
+        static constexpr bool UseStructuredSparsity               = false;
+        static constexpr bool Persistent                          = false;
+        using ELayout = ck_tile::tensor_layout::gemm::RowMajor;
+    };
+    // Compile time parameters
+    static constexpr bool EnableSplitImage                        = EnableSplitImage_;
     static constexpr index_t NumGroupsToMerge                     = NumGroupsToMerge_;
     static constexpr index_t NDimSpatial                          = NDimSpatial_;
     static constexpr ConvolutionSpecialization ConvSpecialization = ConvSpecialization_;
@@ -81,32 +97,43 @@ struct GroupedConvTraits
     using WeiLayout                                               = WeiLayout_;
     using DsLayout                                                = DsLayout_;
     using OutLayout                                               = OutLayout_;
-    using CDElementwise                                           = CDElementwise_;
+
+    // Forward Gemm Layouts
+    using AsLayoutFwd = ck_tile::tensor_layout::gemm::RowMajor;
+    using BsLayoutFwd = ck_tile::tensor_layout::gemm::ColumnMajor;
+    using CLayoutFwd  = ck_tile::tensor_layout::gemm::RowMajor;
+    // Backward Data Gemm Layouts
+    using AsLayoutBwdData = ck_tile::tensor_layout::gemm::RowMajor;
+    using BsLayoutBwdData = ck_tile::tensor_layout::gemm::RowMajor;
+    using CLayoutBwdData  = ck_tile::tensor_layout::gemm::RowMajor;
+    // Backward Weight Gemm Layouts
+    using AsLayoutBwdWeight = ck_tile::tensor_layout::gemm::ColumnMajor;
+    using BsLayoutBwdWeight = ck_tile::tensor_layout::gemm::RowMajor;
+    using CLayoutBwdWeight  = ck_tile::tensor_layout::gemm::RowMajor;
+
+    template <ck_tile::index_t NumWaveGroups = 1>
     using GroupedConvImplicitGemmTraitsFwd =
-        TileGemmTraits<true,
-                       true,
-                       true,
-                       ck_tile::tensor_layout::gemm::RowMajor,
-                       ck_tile::tensor_layout::gemm::ColumnMajor,
-                       ck_tile::tensor_layout::gemm::RowMajor>;
-    using GroupedConvImplicitGemmTraitsBwdData =
-        TileGemmTraits<true,
-                       true,
-                       true,
-                       ck_tile::tensor_layout::gemm::RowMajor,
-                       ck_tile::tensor_layout::gemm::RowMajor,
-                       ck_tile::tensor_layout::gemm::RowMajor>;
-    using GroupedConvImplicitGemmTraitsBwdWeight =
-        TileGemmTraits<true,
-                       true,
-                       true,
-                       ck_tile::tensor_layout::gemm::ColumnMajor,
-                       ck_tile::tensor_layout::gemm::RowMajor,
-                       ck_tile::tensor_layout::gemm::RowMajor>;
+        TileGemmTraits<true, true, true, AsLayoutFwd, BsLayoutFwd, CLayoutFwd, NumWaveGroups>;
+    template <ck_tile::index_t NumWaveGroups = 1>
+    using GroupedConvImplicitGemmTraitsBwdData = TileGemmTraits<true,
+                                                                true,
+                                                                true,
+                                                                AsLayoutBwdData,
+                                                                BsLayoutBwdData,
+                                                                CLayoutBwdData,
+                                                                NumWaveGroups>;
+    template <ck_tile::index_t NumWaveGroups = 1>
+    using GroupedConvImplicitGemmTraitsBwdWeight  = TileGemmTraits<true,
+                                                                   true,
+                                                                   true,
+                                                                   AsLayoutBwdWeight,
+                                                                   BsLayoutBwdWeight,
+                                                                   CLayoutBwdWeight,
+                                                                   NumWaveGroups>;
     static constexpr ck_tile::index_t VectorSizeA = VectorSizeA_;
     static constexpr ck_tile::index_t VectorSizeB = VectorSizeB_;
     static constexpr ck_tile::index_t VectorSizeC = VectorSizeC_;
-    static constexpr index_t NumDTensor           = DsLayout::size();
+    static constexpr ck_tile::index_t NumDTensor  = DsLayout::size();
     using ImplicitGemmDsLayout                    = decltype(generate_implicit_gemm_layout());
 };
 
diff --git a/include/ck_tile/ops/reduce/block/block_reduce.hpp b/include/ck_tile/ops/reduce/block/block_reduce.hpp
index 7a10d1fa56..2fd8a48eee 100644
--- a/include/ck_tile/ops/reduce/block/block_reduce.hpp
+++ b/include/ck_tile/ops/reduce/block/block_reduce.hpp
@@ -32,7 +32,7 @@ CK_TILE_DEVICE void block_tile_reduce_sync(AccDistributedTensor_& acc_tensor,
 
     constexpr index_t idim_p_lane = NDimP - 1;
 
-    const auto ps_idx = detail::get_partition_index(acc_tensor.get_tile_distribution());
+    const auto ps_idx = get_partition_index(acc_tensor.get_tile_distribution());
     const auto rs_idx = acc_tensor.get_tile_distribution().calculate_rs_index_from_ps_index(ps_idx);
 
     constexpr index_t thread_buf_size = AccDistributedTensor_::get_thread_buffer_size();
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_kn_mn.hpp
index 71b5c5e7cf..806b6e684d 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_kn_mn.hpp
@@ -48,7 +48,9 @@ using device_gemm_wmma_universal_f16_f16_f16_km_kn_mn_comp_instances = std::tupl
         DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,    64,    64,   2,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              2,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,    64,    64,   8,   2,   16,   16,       4,       2,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              2,         0,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
     // clang-format on
     >;
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_nk_mn.hpp
index f4489dc45f..4516d06492 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_km_nk_mn.hpp
@@ -50,7 +50,9 @@ using device_gemm_wmma_universal_f16_f16_f16_km_nk_mn_comp_instances = std::tupl
         DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    64,    32,    64,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    64,    32,    64,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,    64,    64,   2,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              2,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,    64,    64,   8,   2,   16,   16,       4,       2,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              2,              2,         0,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
     // clang-format on
     >;
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn.hpp
index 423f86365c..5ace0594f0 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn.hpp
@@ -53,7 +53,9 @@ using device_gemm_wmma_universal_f16_f16_f16_mk_kn_mn_comp_instances = std::tupl
         DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,    64,    64,   2,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              2,              2,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,    64,    64,   8,   2,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>
     // clang-format on
     >;
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn.hpp
index 2eb28958e6..27deab1c8c 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f16_f16/device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn.hpp
@@ -56,7 +56,9 @@ using device_gemm_wmma_universal_f16_f16_f16_mk_nk_mn_comp_instances = std::tupl
         DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,    64,    64,   8,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   160,    64,   8,   8,   16,   16,       2,       5,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,    64,    64,   2,   8,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              2,              2,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,    64,    64,   8,   2,   16,   16,       4,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              2,              2,         0,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>
     // clang-format on
     >;
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f8_f16/device_gemm_wmma_universal_f16_f8_f16_km_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f8_f16/device_gemm_wmma_universal_f16_f8_f16_km_kn_mn.hpp
index d10b9facd5..bd5c7d8783 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f8_f16/device_gemm_wmma_universal_f16_f8_f16_km_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f8_f16/device_gemm_wmma_universal_f16_f8_f16_km_kn_mn.hpp
@@ -48,7 +48,8 @@ using device_gemm_wmma_universal_f16_f8_f16_km_kn_mn_comp_instances = std::tuple
         DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,   F16,   F8,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,   F16,   F8,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,   F16,   F8,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,   F16,   F8,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,   F16,   F8,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,   F16,   F8,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   256,    64,   8,  16,   16,   16,       2,       8,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,             16,         1,          1,          1,        S<1, 64, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>
     // clang-format on
     >;
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f8_f16/device_gemm_wmma_universal_f16_f8_f16_km_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f8_f16/device_gemm_wmma_universal_f16_f8_f16_km_nk_mn.hpp
index d9d16ede65..1956d1a951 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f8_f16/device_gemm_wmma_universal_f16_f8_f16_km_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f8_f16/device_gemm_wmma_universal_f16_f8_f16_km_nk_mn.hpp
@@ -49,7 +49,8 @@ using device_gemm_wmma_universal_f16_f8_f16_km_nk_mn_comp_instances = std::tuple
         DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,    F8,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,    F8,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,    F8,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,    F8,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,    F8,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,    80,    64,   8,   8,   16,   16,       1,       5,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,  F16,    F8,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   256,    64,   8,  16,   16,   16,       2,       8,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,          1,          1,        S<1, 64, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>
     // clang-format on
     >;
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f8_f16/device_gemm_wmma_universal_f16_f8_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f8_f16/device_gemm_wmma_universal_f16_f8_f16_mk_kn_mn.hpp
index 9277e5e901..934c6aa7ef 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f8_f16/device_gemm_wmma_universal_f16_f8_f16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f8_f16/device_gemm_wmma_universal_f16_f8_f16_mk_kn_mn.hpp
@@ -51,7 +51,8 @@ using device_gemm_wmma_universal_f16_f8_f16_mk_kn_mn_comp_instances = std::tuple
         DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,    F8,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,    F8,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,    F8,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,    F8,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,    F8,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,  F16,    F8,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   256,    64,   8,  16,   16,   16,       2,       8,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,             16,         1,          1,          1,        S<1, 64, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>
     // clang-format on
     >;
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f8_f16/device_gemm_wmma_universal_f16_f8_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f8_f16/device_gemm_wmma_universal_f16_f8_f16_mk_nk_mn.hpp
index e97a649c19..9860b81b78 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f8_f16/device_gemm_wmma_universal_f16_f8_f16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f16_f8_f16/device_gemm_wmma_universal_f16_f8_f16_mk_nk_mn.hpp
@@ -51,7 +51,8 @@ using device_gemm_wmma_universal_f16_f8_f16_mk_nk_mn_comp_instances = std::tuple
         DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,    F8,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,    F8,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,    F8,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,    F8,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,    F8,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,  F16,    F8,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   256,    64,   8,  16,   16,   16,       2,       8,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,          1,          1,        S<1, 64, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>
     // clang-format on
     >;
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f16_f16/device_gemm_wmma_universal_f8_f16_f16_km_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f16_f16/device_gemm_wmma_universal_f8_f16_f16_km_kn_mn.hpp
index c8f1b85ddb..4d7169565a 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f16_f16/device_gemm_wmma_universal_f8_f16_f16_km_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f16_f16/device_gemm_wmma_universal_f8_f16_f16_km_kn_mn.hpp
@@ -49,7 +49,8 @@ using device_gemm_wmma_universal_f8_f16_f16_km_kn_mn_comp_instances = std::tuple
         DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,   F8,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    64,    32,    64,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Interwave, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,   F8,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,   F8,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,   F8,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,   F8,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Row,     Row,   F8,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   256,    64,  16,   8,   16,   16,       2,       8,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,             16,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 64, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>
     // clang-format on
     >;
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f16_f16/device_gemm_wmma_universal_f8_f16_f16_km_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f16_f16/device_gemm_wmma_universal_f8_f16_f16_km_nk_mn.hpp
index fc0220a502..3728368bc4 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f16_f16/device_gemm_wmma_universal_f8_f16_f16_km_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f16_f16/device_gemm_wmma_universal_f8_f16_f16_km_nk_mn.hpp
@@ -51,7 +51,8 @@ using device_gemm_wmma_universal_f8_f16_f16_km_nk_mn_comp_instances = std::tuple
         DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,   F8,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,   F8,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,   F8,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,   F8,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,   F8,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Col,     Col,     Row,   F8,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   256,    64,  16,   8,   16,   16,       2,       8,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,             16,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>
     // clang-format on
     >;
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f16_f16/device_gemm_wmma_universal_f8_f16_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f16_f16/device_gemm_wmma_universal_f8_f16_f16_mk_kn_mn.hpp
index b87cf64b0f..3506575f5d 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f16_f16/device_gemm_wmma_universal_f8_f16_f16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f16_f16/device_gemm_wmma_universal_f8_f16_f16_mk_kn_mn.hpp
@@ -51,7 +51,8 @@ using device_gemm_wmma_universal_f8_f16_f16_mk_kn_mn_comp_instances = std::tuple
         DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,   F8,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,   F8,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,   F8,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,   F8,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    64,    32,    64,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,   F8,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    64,    32,    64,   8,   8,   16,   16,       4,       1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,   F8,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   256,    64,  16,   8,   16,   16,       2,       8,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 64, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>
     // clang-format on
     >;
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f16_f16/device_gemm_wmma_universal_f8_f16_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f16_f16/device_gemm_wmma_universal_f8_f16_f16_mk_nk_mn.hpp
index 31ad66409e..eef0d6de6a 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f16_f16/device_gemm_wmma_universal_f8_f16_f16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f16_f16/device_gemm_wmma_universal_f8_f16_f16_mk_nk_mn.hpp
@@ -50,7 +50,8 @@ using device_gemm_wmma_universal_f8_f16_f16_mk_nk_mn_comp_instances = std::tuple
         DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,   F8,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,   F8,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,   F8,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,   128,   128,    32,   8,   8,   16,   16,       4,       4,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,   F8,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,   F8,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,   F8,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   256,    64,  16,   8,   16,   16,       2,       8,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 64, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v1>
     // clang-format on
     >;
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn.hpp
index 4c37c398fe..2418be62b7 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn.hpp
@@ -55,7 +55,8 @@ using device_gemm_wmma_universal_f8_f8_bf16_mk_kn_mn_comp_instances =
         DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,   F8,    F8,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3,      F8,      F8>,
         DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,   F8,    F8,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3,      F8,      F8>,
         DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,   F8,    F8,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3,      F8,      F8>,
-        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,   F8,    F8,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3,      F8,      F8>
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,   F8,    F8,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   128,    64,    64,    32,   8,   8,   16,   16,       2,       2,       S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,         1,          1,          1,        S<1, 32, 1, 2>,                     8,  Intrawave, BlockGemmPipelineVersion::v3,      F8,      F8>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Row,     Row,   F8,    F8,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   256,    64,  16,  16,   16,   16,       2,       8,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,       S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,             16,         1,          1,          1,        S<1, 64, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v1,      F8,      F8>
         // clang-format on
         >;
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn.hpp
index 6b5314b701..38f2869303 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_wmma_universal_f8_f8_bf16/device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn.hpp
@@ -51,7 +51,8 @@ using device_gemm_wmma_universal_f8_f8_bf16_mk_nk_mn_comp_instances =
         DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,   F8,    F8,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 16, 1, 4>,                     8,  Interwave, BlockGemmPipelineVersion::v1,      F8,      F8>,
         DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,   F8,    F8,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    32,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3,      F8,      F8>,
         DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,   F8,    F8,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   128,    64,   8,   8,   16,   16,       4,       2,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,          1,          1,        S<1, 32, 1, 8>,                     8,  Intrawave, BlockGemmPipelineVersion::v3,      F8,      F8>,
-        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,   F8,    F8,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3,      F8,      F8>
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,   F8,    F8,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,    64,    32,    64,    64,   8,   8,   16,   16,       2,       2,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,       S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,          1,          1,        S<1, 16, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v3,      F8,      F8>,
+        DeviceGemm_Wmma_CShuffleV3<      Row,     Col,     Row,   F8,    F8,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough, GemmSpec,   256,   128,   256,    64,  16,  16,   16,   16,       2,       8,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,       S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,          1,          1,        S<1, 64, 1, 4>,                     8,  Intrawave, BlockGemmPipelineVersion::v1,      F8,      F8>
         // clang-format on
         >;
 } // namespace instance
diff --git a/profiler/README.md b/profiler/README.md
index 05bbc7b4f9..86f668eacb 100644
--- a/profiler/README.md
+++ b/profiler/README.md
@@ -1,5 +1,23 @@
 [Back to the main page](../README.md)
 # Composable Kernel profiler
+
+## Building Specific Profilers
+To reduce build time, filter which operations to compile using CMake options:
+
+```bash
+# Build all grouped_gemm variants (grouped_gemm, grouped_gemm_fastgelu, grouped_gemm_tile_loop, etc.)
+cmake -DCK_PROFILER_OP_FILTER="grouped_gemm" <other options> ..
+
+# Build ONLY base grouped_gemm (excludes variants - use exact regex match with ^ and $)
+cmake -DCK_PROFILER_OP_FILTER="^grouped_gemm$" <other options> ..
+```
+
+Both `CK_PROFILER_OP_FILTER` and `CK_PROFILER_INSTANCE_FILTER` accept regex patterns. By default, all operations are built.
+
+To list every available operation name, run the following command from the repository root:
+```bash
+find profiler/src -name "profile_*.cpp" | sed 's|profiler/src/profile_||' | sed 's|\.cpp$||' | sort
+```
 ## Profiler GEMM UNIVERSAL kernels
 ```bash
 # arg1: tensor operation (gemm_universal: Universal GEMM)
diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt
index 9f86f6d88f..c22867fbed 100644
--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -40,6 +40,9 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
     list(APPEND PROFILER_OPS profile_contraction_bilinear.cpp)
     list(APPEND PROFILER_OPS profile_contraction_scale.cpp)
   endif()
+endif()
+
+if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx1[12]")
   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
     list(APPEND PROFILER_OPS profile_gemm_reduce.cpp)
     list(APPEND PROFILER_OPS profile_batched_gemm_add_relu_gemm_add.cpp)
@@ -53,7 +56,7 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
     list(APPEND PROFILER_OPS profile_grouped_gemm_tile_loop.cpp)
     list(APPEND PROFILER_OPS profile_grouped_gemm_multiply_tile_loop.cpp)
   endif()
-  if(SUPPORTED_GPU_TARGETS MATCHES "gfx9[45]")
+  if(SUPPORTED_GPU_TARGETS MATCHES "gfx9[45]" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12")
     list(APPEND PROFILER_OPS profile_gemm_multiply_multiply_wp.cpp)
     list(APPEND PROFILER_OPS profile_gemm_ab_scale.cpp)
     list(APPEND PROFILER_OPS profile_gemm_blockscale_wp.cpp)
@@ -74,7 +77,6 @@ if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
   list(APPEND PROFILER_OPS profile_conv_bwd_data.cpp)
   list(APPEND PROFILER_OPS profile_conv_fwd.cpp)
   list(APPEND PROFILER_OPS profile_grouped_conv_fwd_outelementop.cpp)
-
 endif()
 
 if((SUPPORTED_GPU_TARGETS MATCHES "gfx9" AND (DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)) OR
diff --git a/profiler/src/profiler_operation_registry.hpp b/profiler/src/profiler_operation_registry.hpp
index 276b7b38dc..7e6d22d4ce 100644
--- a/profiler/src/profiler_operation_registry.hpp
+++ b/profiler/src/profiler_operation_registry.hpp
@@ -74,6 +74,6 @@ class ProfilerOperationRegistry final
 #define PP_CONCAT(x, y) PP_CONCAT_IMPL(x, y)
 #define PP_CONCAT_IMPL(x, y) x##y
 
-#define REGISTER_PROFILER_OPERATION(name, description, operation)              \
-    static const bool PP_CONCAT(operation_registration_result_, __COUNTER__) = \
+#define REGISTER_PROFILER_OPERATION(name, description, operation)                            \
+    __extension__ static const bool PP_CONCAT(operation_registration_result_, __COUNTER__) = \
         ::ProfilerOperationRegistry::GetInstance().Add(name, description, operation)
diff --git a/script/check_copyright_year.sh b/script/check_copyright_year.sh
index f7709472ef..1b63c6b711 100755
--- a/script/check_copyright_year.sh
+++ b/script/check_copyright_year.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 
 current_year=$(date +%Y)
 exit_code=0
diff --git a/script/clang-format-overwrite.sh b/script/clang-format-overwrite.sh
index 74391ded28..23b57b9935 100755
--- a/script/clang-format-overwrite.sh
+++ b/script/clang-format-overwrite.sh
@@ -1,2 +1,5 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 find . -name deps -prune -o -name build -prune -o -iname '*.h' -o -iname '*.hpp' -o -iname '*.cpp' -o -iname '*.h.in' -o -iname '*.hpp.in' -o -iname '*.cpp.in' -o -iname '*.cl' -o -iname '*.cuh' -o -iname '*.cu' -o -iname '*.inc' | grep -v 'build/' | grep -v 'include/rapidjson'| xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-18 -i -style=file {}'
 git status --porcelain | awk '$1 != "D" && (match($2, "\\.cpp|.hpp|.inc|include/rapidjson/")) {print $2}' | xargs -n 1 -P 16 -I{} -t sh -c 'clang-format-18 -i -style=file {}'
diff --git a/script/cmake-ck-dev.sh b/script/cmake-ck-dev.sh
index 6220009b03..9643af1de0 100755
--- a/script/cmake-ck-dev.sh
+++ b/script/cmake-ck-dev.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 # exit when a command exits with non-zero status; also when an unbound variable is referenced
 set -eu
 # pipefail is supported by many shells, not supported by sh and dash
diff --git a/script/convert_miopen_driver_to_profiler.py b/script/convert_miopen_driver_to_profiler.py
index d814e0719c..5aff9c0a7f 100644
--- a/script/convert_miopen_driver_to_profiler.py
+++ b/script/convert_miopen_driver_to_profiler.py
@@ -1,5 +1,6 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 # SPDX-License-Identifier: MIT
-# Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
 # Convert miopen driver command to ck Profiler
 # Example: python3 ../script/convert_miopen_driver_to_profiler.py
 # /opt/rocm/bin/MIOpenDriver conv -n 32 -c 64 -H 28 -W 28 -k 64 -y 3 -x 3
diff --git a/script/count_vgpr.sh b/script/count_vgpr.sh
index 07debc53a8..651a894db6 100755
--- a/script/count_vgpr.sh
+++ b/script/count_vgpr.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 FILE=$1
 
 for num in {0..255}
diff --git a/script/dependency-parser/generate_list_of_files_not_referenced_in_tests.py b/script/dependency-parser/generate_list_of_files_not_referenced_in_tests.py
index 8419b9491e..58bb9e8e93 100644
--- a/script/dependency-parser/generate_list_of_files_not_referenced_in_tests.py
+++ b/script/dependency-parser/generate_list_of_files_not_referenced_in_tests.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python3
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
 
-## Copyright © Advanced Micro Devices, Inc. or its affiliates.
-## SPDX-License-Identifier: MIT
 
 # This script generate list of files that are not referenced from any test (list in JSON format)
 # Script only looks at not referenced files from three directories: include, library and profiler
diff --git a/script/dependency-parser/main.py b/script/dependency-parser/main.py
index 623ae05afd..f345362b26 100644
--- a/script/dependency-parser/main.py
+++ b/script/dependency-parser/main.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 # SPDX-License-Identifier: MIT
 
 """
diff --git a/script/dependency-parser/src/enhanced_ninja_parser.py b/script/dependency-parser/src/enhanced_ninja_parser.py
index ff6344a4c1..2ac8e8537a 100644
--- a/script/dependency-parser/src/enhanced_ninja_parser.py
+++ b/script/dependency-parser/src/enhanced_ninja_parser.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 # SPDX-License-Identifier: MIT
 
 """
diff --git a/script/dependency-parser/src/selective_test_filter.py b/script/dependency-parser/src/selective_test_filter.py
index d3228ef624..83f7f7eebe 100644
--- a/script/dependency-parser/src/selective_test_filter.py
+++ b/script/dependency-parser/src/selective_test_filter.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 # SPDX-License-Identifier: MIT
 
 """
diff --git a/script/gemm_profile.sh b/script/gemm_profile.sh
index 89419ca711..d3d66bcaa9 100755
--- a/script/gemm_profile.sh
+++ b/script/gemm_profile.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 # SPDX-License-Identifier: MIT
 
 BIN=./bin/tile_example_gemm_weight_preshuffle
diff --git a/script/hipclang_opt.sh b/script/hipclang_opt.sh
index c51bd51d97..ba5636eeb6 100755
--- a/script/hipclang_opt.sh
+++ b/script/hipclang_opt.sh
@@ -1,3 +1,6 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 rm *.ll *.s
 
 BC_FILE=$1
diff --git a/script/install_precommit.sh b/script/install_precommit.sh
index 545dcfa666..f80b06a95a 100755
--- a/script/install_precommit.sh
+++ b/script/install_precommit.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 run_and_check() {
     "$@"
     status=$?
diff --git a/script/launch_tests.sh b/script/launch_tests.sh
index 17a99e62a3..1911613023 100755
--- a/script/launch_tests.sh
+++ b/script/launch_tests.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 # SPDX-License-Identifier: MIT
 
 # Get the directory where the script is located
@@ -40,20 +40,88 @@ python3 "$SCRIPT_DIR/dependency-parser/main.py" select "$JSON_FILE" origin/devel
 # Path to tests_to_run.json in the same directory
 TEST_FILE="tests_to_run.json"
 
-command=$(python3 -c "
+# Configuration: Adjust these defaults as needed
+# Number of tests per ctest command (can be overridden with CTEST_CHUNK_SIZE env var)
+DEFAULT_CHUNK_SIZE=10
+# Whether to stop on first failure (can be overridden with CTEST_FAIL_FAST env var)
+DEFAULT_FAIL_FAST=false
+
+# Split tests into chunks and run multiple ctest commands
+# Export variables so Python subprocess can access them
+export CHUNK_SIZE=${CTEST_CHUNK_SIZE:-$DEFAULT_CHUNK_SIZE}
+export FAIL_FAST=${CTEST_FAIL_FAST:-$DEFAULT_FAIL_FAST}
+
+python3 -c "
 import json
 import os
+import sys
+import subprocess
+
+CHUNK_SIZE = max(1, int(os.environ.get('CHUNK_SIZE', '10')))  # clamp: a size below 1 would divide by zero or make range() skip every test
+FAIL_FAST = os.environ.get('FAIL_FAST', 'false').lower() == 'true'  # only the literal string true (any case) enables fail-fast
+
 with open('$TEST_FILE', 'r') as f:
     data = json.load(f)
     tests = data.get('tests_to_run', [])
-    if tests:
-        # Extract just the filename after the last '/'
-        clean_tests = [os.path.basename(test) for test in tests]
-        print('ctest --output-on-failure -R \"' + '|'.join(clean_tests) + '\"')
-    else:
+    
+    if not tests:
         print('# No tests to run')
-")
+        sys.exit(0)
+    
+    # Extract just the filename after the last '/'
+    clean_tests = [os.path.basename(test) for test in tests]
+    
+    total_tests = len(clean_tests)
+    total_chunks = (total_tests + CHUNK_SIZE - 1) // CHUNK_SIZE
+    
+    print(f'# Total tests to run: {total_tests}')
+    print(f'# Running in {total_chunks} chunk(s) of up to {CHUNK_SIZE} tests each')
+    print(f'# Fail-fast mode: {FAIL_FAST}')
+    print()
+    
+    failed_chunks = []
+    
+    # Split into chunks
+    for i in range(0, total_tests, CHUNK_SIZE):
+        chunk = clean_tests[i:i+CHUNK_SIZE]
+        chunk_num = (i // CHUNK_SIZE) + 1
+        
+        print(f'Running test chunk {chunk_num}/{total_chunks} ({len(chunk)} tests)...')
+        sys.stdout.flush()
+        
+        # Run ctest command, don't raise exception on failure
+        cmd = ['ctest', '--output-on-failure', '-R', '|'.join(chunk)]
+        try:
+            result = subprocess.run(cmd, cwd='$BUILD_DIR', check=False)
+            
+            if result.returncode != 0:
+                failed_chunks.append(chunk_num)
+                print(f'WARNING: Chunk {chunk_num} had test failures (exit code: {result.returncode})')
+                
+                # If fail-fast is enabled, exit immediately
+                if FAIL_FAST:
+                    print(f'FAIL-FAST: Stopping at chunk {chunk_num} due to failures')
+                    sys.exit(1)
+        except Exception as e:
+            print(f'ERROR: Failed to run chunk {chunk_num}: {e}')
+            failed_chunks.append(chunk_num)
+            if FAIL_FAST:
+                sys.exit(1)
+        
+        print()
+        sys.stdout.flush()
+    
+    # Print summary
+    print('=' * 60)
+    if failed_chunks:
+        print(f'SUMMARY: {len(failed_chunks)} of {total_chunks} chunk(s) had failures: {failed_chunks}')
+        print('=' * 60)
+        sys.exit(1)
+    else:
+        print(f'SUMMARY: All {total_chunks} chunk(s) passed successfully!')
+        print('=' * 60)
+        sys.exit(0)
+" 
+PYTHON_EXIT=$?
 
-echo "$command"
-
-eval "$command"
+exit $PYTHON_EXIT
diff --git a/script/ninja_json_converter.py b/script/ninja_json_converter.py
index e68f7ccfa3..5e974cf730 100644
--- a/script/ninja_json_converter.py
+++ b/script/ninja_json_converter.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 # SPDX-License-Identifier: MIT
 
 """
diff --git a/script/process_perf_data.py b/script/process_perf_data.py
index b35ba64041..5f81512a4c 100644
--- a/script/process_perf_data.py
+++ b/script/process_perf_data.py
@@ -1,4 +1,7 @@
 #!/usr/bin/env python3
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 import os
 import io
 import argparse
diff --git a/script/process_perf_data.sh b/script/process_perf_data.sh
index 50c84924f5..4786ddded0 100755
--- a/script/process_perf_data.sh
+++ b/script/process_perf_data.sh
@@ -1,4 +1,7 @@
 #!/bin/bash 
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 #
 # in order to run this script you'd need the following python packages:
 
diff --git a/script/process_qa_data.sh b/script/process_qa_data.sh
index 420453cddc..d56ef5c1ec 100755
--- a/script/process_qa_data.sh
+++ b/script/process_qa_data.sh
@@ -1,4 +1,7 @@
 #!/bin/bash 
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 #
 # in order to run this script you'd need the following python packages:
 
diff --git a/script/profile_batched_gemm.sh b/script/profile_batched_gemm.sh
index f90baaed68..bb7d61deec 100755
--- a/script/profile_batched_gemm.sh
+++ b/script/profile_batched_gemm.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
  
 ## GPU visibility
 export HIP_VISIBLE_DEVICES=0
diff --git a/script/profile_gemm.sh b/script/profile_gemm.sh
index b88159e74d..f766ca50fa 100755
--- a/script/profile_gemm.sh
+++ b/script/profile_gemm.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 
 ## GPU visibility
 export HIP_VISIBLE_DEVICES=0
diff --git a/script/profile_gemm_bilinear.sh b/script/profile_gemm_bilinear.sh
index e6edefae85..057d7d7e49 100755
--- a/script/profile_gemm_bilinear.sh
+++ b/script/profile_gemm_bilinear.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 ## GPU visibility
 export HIP_VISIBLE_DEVICES=0
 DRIVER="../build/bin/ckProfiler"
diff --git a/script/profile_grouped_conv_bwd_data.sh b/script/profile_grouped_conv_bwd_data.sh
index a1d2f450c9..3805ed86cd 100755
--- a/script/profile_grouped_conv_bwd_data.sh
+++ b/script/profile_grouped_conv_bwd_data.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
  
 ## GPU visibility
 export HIP_VISIBLE_DEVICES=0
diff --git a/script/profile_grouped_conv_bwd_weight.sh b/script/profile_grouped_conv_bwd_weight.sh
index e3652202d4..146431621c 100755
--- a/script/profile_grouped_conv_bwd_weight.sh
+++ b/script/profile_grouped_conv_bwd_weight.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
  
 ## GPU visibility
 export HIP_VISIBLE_DEVICES=0
diff --git a/script/profile_grouped_conv_fwd.sh b/script/profile_grouped_conv_fwd.sh
index 9a974525ad..8491aecf9e 100755
--- a/script/profile_grouped_conv_fwd.sh
+++ b/script/profile_grouped_conv_fwd.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
  
 ## GPU visibility
 export HIP_VISIBLE_DEVICES=0
diff --git a/script/profile_grouped_conv_fwd_outelementop.sh b/script/profile_grouped_conv_fwd_outelementop.sh
index ac444a25c2..a0df8cd4c5 100755
--- a/script/profile_grouped_conv_fwd_outelementop.sh
+++ b/script/profile_grouped_conv_fwd_outelementop.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 
 ## GPU visibility
 export HIP_VISIBLE_DEVICES=0
diff --git a/script/profile_grouped_gemm.sh b/script/profile_grouped_gemm.sh
index 8adb7c81ac..fe452d5cab 100755
--- a/script/profile_grouped_gemm.sh
+++ b/script/profile_grouped_gemm.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
  
 ## GPU visibility
 export HIP_VISIBLE_DEVICES=0
diff --git a/script/profile_mixed_gemm.sh b/script/profile_mixed_gemm.sh
index 383c7ea36e..a867bf3a77 100755
--- a/script/profile_mixed_gemm.sh
+++ b/script/profile_mixed_gemm.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 
 ## GPU visibility
 export HIP_VISIBLE_DEVICES=0
diff --git a/script/profile_onnx_gemm.sh b/script/profile_onnx_gemm.sh
index c2721e7f59..ea18fc761e 100755
--- a/script/profile_onnx_gemm.sh
+++ b/script/profile_onnx_gemm.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 ## GPU visibility
 export HIP_VISIBLE_DEVICES=0
 DRIVER="../build/bin/ckProfiler"
diff --git a/script/profile_permute_scale.sh b/script/profile_permute_scale.sh
index 945d10f47b..31d6a06c5e 100755
--- a/script/profile_permute_scale.sh
+++ b/script/profile_permute_scale.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 
 ## GPU visibility
 export HIP_VISIBLE_DEVICES=0
diff --git a/script/profile_reduce_no_index.sh b/script/profile_reduce_no_index.sh
index 66bfe1dcd3..3bae07906b 100755
--- a/script/profile_reduce_no_index.sh
+++ b/script/profile_reduce_no_index.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 DRIVER="../build/bin/ckProfiler"
 VERIFY="-v $1"
 INIT=$2
diff --git a/script/profile_reduce_with_index.sh b/script/profile_reduce_with_index.sh
index 43543f4430..943a590528 100755
--- a/script/profile_reduce_with_index.sh
+++ b/script/profile_reduce_with_index.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 DRIVER="../build/bin/ckProfiler"
 VERIFY="-v $1"
 INIT=$2
diff --git a/script/profile_resnet50.sh b/script/profile_resnet50.sh
index b55cb2ccef..ec6b32c0c8 100755
--- a/script/profile_resnet50.sh
+++ b/script/profile_resnet50.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 
 ## GPU visibility
 export HIP_VISIBLE_DEVICES=0
diff --git a/script/profile_splitK_gemm.sh b/script/profile_splitK_gemm.sh
index d62f0e4753..843d59c918 100755
--- a/script/profile_splitK_gemm.sh
+++ b/script/profile_splitK_gemm.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 
 ## GPU visibility
 export HIP_VISIBLE_DEVICES=0
diff --git a/script/remod_for_ck_tile.py b/script/remod_for_ck_tile.py
index 7601c9d619..feb50dc290 100755
--- a/script/remod_for_ck_tile.py
+++ b/script/remod_for_ck_tile.py
@@ -1,3 +1,6 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 import os
 
 root_dir = os.getcwd()
diff --git a/script/remove_exec_bit.sh b/script/remove_exec_bit.sh
index 2926683d6a..0b3ca80422 100755
--- a/script/remove_exec_bit.sh
+++ b/script/remove_exec_bit.sh
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# Copyright © Advanced Micro Devices, Inc., or its affiliates.
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 # SPDX-License-Identifier: MIT
 
 for file in $(git diff --cached --name-only --diff-filter=ACM | grep -E '\.(cpp|hpp|txt|inc)$'); do
diff --git a/script/run_ck_profiler_gemm_with_csv_shapes.py b/script/run_ck_profiler_gemm_with_csv_shapes.py
index eb0eb9c920..2590e3942e 100644
--- a/script/run_ck_profiler_gemm_with_csv_shapes.py
+++ b/script/run_ck_profiler_gemm_with_csv_shapes.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 # SPDX-License-Identifier: MIT
-# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 # -*- coding: utf-8 -*-
 
diff --git a/script/run_full_performance_tests.sh b/script/run_full_performance_tests.sh
index 508200b21a..55740da097 100755
--- a/script/run_full_performance_tests.sh
+++ b/script/run_full_performance_tests.sh
@@ -1,4 +1,7 @@
 #!/bin/bash 
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 #
 # in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/
 # you would also need to set up some environment variables in order to 
diff --git a/script/run_gemm_performance_tests.sh b/script/run_gemm_performance_tests.sh
index 12adad30f8..c72b2a760b 100755
--- a/script/run_gemm_performance_tests.sh
+++ b/script/run_gemm_performance_tests.sh
@@ -1,4 +1,7 @@
 #!/bin/bash 
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 #
 # in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/
 # run the script as "./run_gemm_performance_tests.sh <verification> <tag for your test environment> <branch name> <node name> <arch>
diff --git a/script/run_performance_tests.sh b/script/run_performance_tests.sh
index 4e13b59d34..9163e6d693 100755
--- a/script/run_performance_tests.sh
+++ b/script/run_performance_tests.sh
@@ -1,4 +1,7 @@
 #!/bin/bash 
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 #
 # in order to run this script you'd first need to build the ckProfiler executable in ../build/bin/
 # run the script as "./run_performance_tests.sh <verification> <tag for your test environment> <branch name> <node name>
diff --git a/script/sccache_wrapper.sh b/script/sccache_wrapper.sh
index b0ec08de45..1a7e37881e 100755
--- a/script/sccache_wrapper.sh
+++ b/script/sccache_wrapper.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 set -e
 COMPILERS_HASH_DIR=${COMPILERS_HASH_DIR:-"/tmp/.sccache"}
 SCCACHE_EXTRAFILES=${SCCACHE_EXTRAFILES:-"${COMPILERS_HASH_DIR}/rocm_compilers_hash_file"}
@@ -52,5 +55,22 @@ if [ "${ENFORCE_REDIS}" == "true" ]; then
 fi
 setup_rocm_compilers_hash_file
 $SCCACHE_BIN --version
+echo "=== Starting sccache server at $(date) ==="
 $SCCACHE_BIN --start-server
 
+# Log initial sccache statistics (best effort: a failure here must not abort under `set -e`)
+echo "=== Initial sccache statistics ==="
+$SCCACHE_BIN --show-stats || echo "Could not get initial stats"
+
+# Test Redis connectivity and performance (URL is quoted so it is never word-split or globbed)
+echo "=== Testing Redis connectivity ==="
+start_time=$(date +%s%N)
+redis-cli -u "${SCCACHE_REDIS}" ping || echo "Redis ping failed"
+end_time=$(date +%s%N)
+latency=$(( (end_time - start_time) / 1000000 ))
+echo "Redis ping latency: ${latency}ms"
+
+# Check Redis memory status (plain INFO: evicted_keys is in the Stats section, not Memory)
+echo "=== Redis memory status ==="
+redis-cli -u "${SCCACHE_REDIS}" info | grep -E "(used_memory|maxmemory|evicted_keys)" || echo "Could not get Redis memory info"
+
diff --git a/script/test_convnd_fwd.sh b/script/test_convnd_fwd.sh
index 8bd2c2fc33..d716caac15 100644
--- a/script/test_convnd_fwd.sh
+++ b/script/test_convnd_fwd.sh
@@ -1,4 +1,7 @@
 #!/usr/bin/env bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 
 # set -e
 
diff --git a/script/test_reduce_no_index.sh b/script/test_reduce_no_index.sh
index b956303837..717a872c45 100755
--- a/script/test_reduce_no_index.sh
+++ b/script/test_reduce_no_index.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 
 ## The following will be used for CI
 
diff --git a/script/uninstall_precommit.sh b/script/uninstall_precommit.sh
index b0d4d15166..394425acdd 100755
--- a/script/uninstall_precommit.sh
+++ b/script/uninstall_precommit.sh
@@ -1 +1,4 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 pre-commit uninstall
diff --git a/test/ck_tile/gemm/CMakeLists.txt b/test/ck_tile/gemm/CMakeLists.txt
index 96c071cbc4..c08ab33b91 100644
--- a/test/ck_tile/gemm/CMakeLists.txt
+++ b/test/ck_tile/gemm/CMakeLists.txt
@@ -22,6 +22,12 @@ else()
     message(DEBUG "Skipping ck_tile_gemm tests for current target")
 endif()
 
+# Consolidated GEMM pipeline compiler-validation suite (test_gemm_pipeline_compiler.cpp).
+if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx12")
+    add_gtest_executable(test_gemm_pipeline_compiler test_gemm_pipeline_compiler.cpp)
+    target_compile_options(test_gemm_pipeline_compiler PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})
+endif()
+
 if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx12")
     add_gtest_executable(test_ck_tile_gemm_pipeline_universal_fp8 test_gemm_pipeline_universal_fp8.cpp)
     add_gtest_executable(test_ck_tile_gemm_pipeline_universal_bf8 test_gemm_pipeline_universal_bf8.cpp)
diff --git a/test/ck_tile/gemm/test_gemm_pipeline_compiler.cpp b/test/ck_tile/gemm/test_gemm_pipeline_compiler.cpp
new file mode 100644
index 0000000000..bf39e0b552
--- /dev/null
+++ b/test/ck_tile/gemm/test_gemm_pipeline_compiler.cpp
@@ -0,0 +1,900 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#include "test_gemm_pipeline_kernel_types.hpp"
+#include "test_gemm_pipeline_util.hpp"
+#include "gtest/gtest.h"
+
+// ============================================================================
+// Comprehensive GEMM Compiler Validation Test Suite
+// This file consolidates all GEMM pipeline tests for compiler validation
+// Covers essential combinations of data types, layouts, and pipeline types
+// ============================================================================
+
+// ----------------------------------------------------------------------------
+// Test Class Definitions for Different Pipeline Types
+// ----------------------------------------------------------------------------
+
+template <typename T>
+class TestGemmMem : public TestCkTileGemmPipeline<T, TestGemmMem<T>>
+{
+};
+
+#if defined(CK_TILE_USE_WMMA)
+template <typename T>
+class TestGemmMemWmma : public TestCkTileGemmPipeline<T, TestGemmMemWmma<T>>
+{
+};
+#endif
+
+template <typename T>
+class TestGemmCompV3 : public TestCkTileGemmPipeline<T, TestGemmCompV3<T>>
+{
+};
+
+#if defined(CK_TILE_USE_WMMA)
+template <typename T>
+class TestGemmCompV3Wmma : public TestCkTileGemmPipeline<T, TestGemmCompV3Wmma<T>>
+{
+};
+#endif
+
+template <typename T>
+class TestGemmCompV4 : public TestCkTileGemmPipeline<T, TestGemmCompV4<T>>
+{
+};
+
+#if defined(CK_TILE_USE_WMMA)
+template <typename T>
+class TestGemmCompV4Wmma : public TestCkTileGemmPipeline<T, TestGemmCompV4Wmma<T>>
+{
+};
+#endif
+
+template <typename T>
+class TestGemmCompV6 : public TestCkTileGemmPipeline<T, TestGemmCompV6<T>>
+{
+};
+
+template <typename T>
+class TestGemmPersistent : public TestCkTileGemmPipeline<T, TestGemmPersistent<T>>
+{
+};
+
+#if defined(CK_TILE_USE_WMMA)
+template <typename T>
+class TestGemmPersistentWmma : public TestCkTileGemmPipeline<T, TestGemmPersistentWmma<T>>
+{
+};
+#endif
+
+// ----------------------------------------------------------------------------
+// Type Definitions for Each Pipeline Configuration
+// ----------------------------------------------------------------------------
+
+// Memory Pipeline Types
+using MemTestTypes = ::testing::Types<
+    // Parameters: ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType,
+    // M_BlockSize, N_BlockSize, K_BlockSize, M_TileSize, N_TileSize, K_TileSize, Scheduler,
+    // PipelineType
+
+    std::tuple<Row, Row, Row, F16, F16, F32, F16, I64, I64, I32, I16, I16, I16, Interwave, Mem>,
+    std::tuple<Row, Row, Row, BF16, BF16, F32, BF16, I64, I64, I32, I16, I16, I16, Interwave, Mem>>;
+
+#if defined(CK_TILE_USE_WMMA)
+// Memory Pipeline WMMA Types
+using MemWmmaTestTypes = ::testing::Types<
+    std::tuple<Row, Row, Row, F16, F16, F32, F16, I64, I64, I32, I16, I16, I16, Interwave, Mem>,
+    std::tuple<Row, Row, Row, BF16, BF16, F32, BF16, I64, I64, I32, I16, I16, I16, Interwave, Mem>>;
+#endif
+
+// CompV3 Pipeline Types. NOTE(review): BF16 tuple has CDataType F16 (Mem uses BF16); confirm.
+using CompV3TestTypes = ::testing::Types<
+    std::tuple<Row, Row, Row, F16, F16, F32, F16, I64, I64, I32, I16, I16, I16, Intrawave, CompV3>,
+    std::tuple<Row,
+               Row,
+               Row,
+               BF16,
+               BF16,
+               F32,
+               F16,
+               I64,
+               I64,
+               I32,
+               I16,
+               I16,
+               I16,
+               Intrawave,
+               CompV3>>;
+
+#if defined(CK_TILE_USE_WMMA)
+// CompV3 Pipeline WMMA Types. NOTE(review): BF16 tuple has CDataType F16 (Mem uses BF16); confirm.
+using CompV3WmmaTestTypes = ::testing::Types<
+    std::tuple<Row, Row, Row, F16, F16, F32, F16, I64, I64, I32, I16, I16, I16, Intrawave, CompV3>,
+    std::tuple<Row,
+               Row,
+               Row,
+               BF16,
+               BF16,
+               F32,
+               F16,
+               I64,
+               I64,
+               I32,
+               I16,
+               I16,
+               I16,
+               Intrawave,
+               CompV3>>;
+#endif
+
+// CompV4 Pipeline Types. NOTE(review): BF16 tuple has CDataType F16 (Mem uses BF16); confirm.
+using CompV4TestTypes = ::testing::Types<
+    std::tuple<Row, Row, Row, F16, F16, F32, F16, I64, I64, I32, I16, I16, I16, Intrawave, CompV4>,
+    std::tuple<Row,
+               Row,
+               Row,
+               BF16,
+               BF16,
+               F32,
+               F16,
+               I64,
+               I64,
+               I32,
+               I16,
+               I16,
+               I16,
+               Intrawave,
+               CompV4>>;
+
+#if defined(CK_TILE_USE_WMMA)
+// CompV4 Pipeline WMMA Types. NOTE(review): BF16 tuple has CDataType F16 (Mem uses BF16); confirm.
+using CompV4WmmaTestTypes = ::testing::Types<
+    std::tuple<Row, Row, Row, F16, F16, F32, F16, I64, I64, I32, I16, I16, I16, Intrawave, CompV4>,
+    std::tuple<Row,
+               Row,
+               Row,
+               BF16,
+               BF16,
+               F32,
+               F16,
+               I64,
+               I64,
+               I32,
+               I16,
+               I16,
+               I16,
+               Intrawave,
+               CompV4>>;
+#endif
+
+// CompV6 Pipeline Types. NOTE(review): BF16 tuple has CDataType F16 (Mem uses BF16); confirm.
+using CompV6TestTypes = ::testing::Types<
+    std::tuple<Row, Row, Row, F16, F16, F32, F16, I64, I64, I32, I16, I16, I16, Intrawave, CompV6>,
+    std::tuple<Row,
+               Row,
+               Row,
+               BF16,
+               BF16,
+               F32,
+               F16,
+               I64,
+               I64,
+               I32,
+               I16,
+               I16,
+               I16,
+               Intrawave,
+               CompV6>>;
+
+// Persistent CompV3 Pipeline Types
+using PersistentTestTypes = ::testing::Types<std::tuple<Row,
+                                                        Col,
+                                                        Row,
+                                                        F16,
+                                                        F16,
+                                                        F32,
+                                                        F16,
+                                                        I64,
+                                                        I64,
+                                                        I32,
+                                                        I16,
+                                                        I16,
+                                                        I16,
+                                                        Intrawave,
+                                                        CompV3,
+                                                        Persistent>,
+                                             std::tuple<Row,
+                                                        Col,
+                                                        Row,
+                                                        F16,
+                                                        F16,
+                                                        F32,
+                                                        F16,
+                                                        I64,
+                                                        I64,
+                                                        I32,
+                                                        I16,
+                                                        I16,
+                                                        I16,
+                                                        Intrawave,
+                                                        CompV3,
+                                                        NonPersistent>>;
+
+#if defined(CK_TILE_USE_WMMA)
+// Persistent CompV3 Pipeline WMMA Types
+using PersistentWmmaTestTypes = ::testing::Types<std::tuple<Row,
+                                                            Col,
+                                                            Row,
+                                                            F16,
+                                                            F16,
+                                                            F32,
+                                                            F16,
+                                                            I64,
+                                                            I64,
+                                                            I32,
+                                                            I16,
+                                                            I16,
+                                                            I16,
+                                                            Intrawave,
+                                                            CompV3,
+                                                            Persistent>,
+                                                 std::tuple<Row,
+                                                            Col,
+                                                            Row,
+                                                            F16,
+                                                            F16,
+                                                            F32,
+                                                            F16,
+                                                            I64,
+                                                            I64,
+                                                            I32,
+                                                            I16,
+                                                            I16,
+                                                            I16,
+                                                            Intrawave,
+                                                            CompV3,
+                                                            NonPersistent>>;
+#endif
+
+// ----------------------------------------------------------------------------
+// Test Suite Registrations
+// ----------------------------------------------------------------------------
+
+TYPED_TEST_SUITE(TestGemmMem, MemTestTypes);
+#if defined(CK_TILE_USE_WMMA)
+TYPED_TEST_SUITE(TestGemmMemWmma, MemWmmaTestTypes);
+#endif
+TYPED_TEST_SUITE(TestGemmCompV3, CompV3TestTypes);
+#if defined(CK_TILE_USE_WMMA)
+TYPED_TEST_SUITE(TestGemmCompV3Wmma, CompV3WmmaTestTypes);
+#endif
+TYPED_TEST_SUITE(TestGemmCompV4, CompV4TestTypes);
+#if defined(CK_TILE_USE_WMMA)
+TYPED_TEST_SUITE(TestGemmCompV4Wmma, CompV4WmmaTestTypes);
+#endif
+TYPED_TEST_SUITE(TestGemmCompV6, CompV6TestTypes);
+TYPED_TEST_SUITE(TestGemmPersistent, PersistentTestTypes);
+#if defined(CK_TILE_USE_WMMA)
+TYPED_TEST_SUITE(TestGemmPersistentWmma, PersistentWmmaTestTypes);
+#endif
+
+// ============================================================================
+// Memory Pipeline Tests (Mem)
+// ============================================================================
+
+#define TEST_SUITE_NAME TestGemmMem
+
+TYPED_TEST(TEST_SUITE_NAME, SmallM_SingleRow)
+{
+    // Degenerate single-row problem (M == 1) with K spanning two K tiles.
+    constexpr int M = 1;
+    constexpr int N = 1024;
+    constexpr int K = 2 * TestFixture::K_Tile;
+
+    // With a column-major A the run is expected to be rejected with std::runtime_error.
+    constexpr bool a_is_col_major =
+        std::is_same_v<typename TestFixture::ALayout, ck_tile::tensor_layout::gemm::ColumnMajor>;
+    if constexpr(a_is_col_major)
+    {
+        EXPECT_THROW((this->Run(M, N, K)), std::runtime_error);
+    }
+    else
+    {
+        this->Run(M, N, K);
+    }
+}
+
+TYPED_TEST(TEST_SUITE_NAME, SingleTile)
+{
+    this->Run(TestFixture::M_Tile, TestFixture::N_Tile, TestFixture::K_Tile);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, ExactlyTwoTiles_M)
+{
+    this->Run(TestFixture::M_Tile * 2, TestFixture::N_Tile, TestFixture::K_Tile * 2);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, ExactlyTwoTiles_N)
+{
+    this->Run(TestFixture::M_Tile, TestFixture::N_Tile * 2, TestFixture::K_Tile * 2);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, ExactlyTwoTiles_K)
+{
+    this->Run(TestFixture::M_Tile, TestFixture::N_Tile, TestFixture::K_Tile * 2);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Regular_512x1024x512)
+{
+    constexpr int M = 512;
+    constexpr int N = 1024;
+    constexpr int K = 512;
+    this->Run(M, N, K);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Square_1024x1024x1024)
+{
+    constexpr int M = 1024;
+    constexpr int N = 1024;
+    constexpr int K = 1024;
+    this->Run(M, N, K);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, LargeMatrix_2048x2048x2048)
+{
+    constexpr int M = 2048;
+    constexpr int N = 2048;
+    constexpr int K = 2048;
+    this->Run(M, N, K);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, VeryLargeMatrix_4096x4096x4096)
+{
+    constexpr int M = 4096;
+    constexpr int N = 4096;
+    constexpr int K = 4096;
+    this->Run(M, N, K);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, TallSkinny_4096x128x1024)
+{
+    constexpr int M = 4096;
+    constexpr int N = 128;
+    constexpr int K = 1024;
+    this->Run(M, N, K);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, ShortWide_128x4096x1024)
+{
+    constexpr int M = 128;
+    constexpr int N = 4096;
+    constexpr int K = 1024;
+    this->Run(M, N, K);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, DeepNarrow_2048x2048x8192)
+{
+    constexpr int M = 2048;
+    constexpr int N = 2048;
+    constexpr int K = 8192;
+    this->Run(M, N, K);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, StressTest_ExtremelyTallMatrix)
+{
+    constexpr int M = 16384;
+    constexpr int N = 64;
+    constexpr int K = 512;
+    this->Run(M, N, K);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, StressTest_ExtremelyWideMatrix)
+{
+    constexpr int M = 64;
+    constexpr int N = 16384;
+    constexpr int K = 512;
+    this->Run(M, N, K);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, StressTest_VeryDeepK)
+{
+    constexpr int M = 1024;
+    constexpr int N = 1024;
+    constexpr int K = 16384;
+    this->Run(M, N, K);
+}
+
+#undef TEST_SUITE_NAME
+
+#if defined(CK_TILE_USE_WMMA)
+// ============================================================================
+// Memory Pipeline Tests with WMMA
+// ============================================================================
+
+#define TEST_SUITE_NAME TestGemmMemWmma
+
+TYPED_TEST(TEST_SUITE_NAME, SingleTile_WMMA)
+{
+    this->Run(TestFixture::M_Tile, TestFixture::N_Tile, TestFixture::K_Tile);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Regular_WMMA)
+{
+    constexpr int M = 512;
+    constexpr int N = 1024;
+    constexpr int K = 512;
+    this->Run(M, N, K);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, LargeMatrix_WMMA)
+{
+    constexpr int M = 2048;
+    constexpr int N = 2048;
+    constexpr int K = 2048;
+    this->Run(M, N, K);
+}
+
+#undef TEST_SUITE_NAME
+#endif // CK_TILE_USE_WMMA
+
+// ============================================================================
+// Compute V3 Pipeline Tests
+// ============================================================================
+
+#define TEST_SUITE_NAME TestGemmCompV3
+
+TYPED_TEST(TEST_SUITE_NAME, SmallM_CompV3)
+{
+    // Very small M (1 and 2 rows), each paired with K spanning 2 and 4 K tiles.
+    constexpr int N = 1024;
+
+    constexpr bool a_is_col_major =
+        std::is_same_v<typename TestFixture::ALayout, ck_tile::tensor_layout::gemm::ColumnMajor>;
+
+    for(const int M : {1, 2})
+    {
+        for(const int k_tiles : {2, 4})
+        {
+            const int K = k_tiles * TestFixture::K_Tile;
+
+            // A column-major A is expected to be rejected with std::runtime_error.
+            if constexpr(a_is_col_major)
+            {
+                EXPECT_THROW((this->Run(M, N, K)), std::runtime_error);
+            }
+            else
+            {
+                this->Run(M, N, K);
+            }
+        }
+    }
+}
+
+TYPED_TEST(TEST_SUITE_NAME, SingleTile_CompV3)
+{
+    this->Run(TestFixture::M_Tile, TestFixture::N_Tile, TestFixture::K_Tile);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MidLargeM_CompV3)
+{
+    // M values of 127 and 255, each paired with K spanning 2 and 4 K tiles.
+    constexpr int N = 1024;
+
+    // Divisor for the column-major check below: 16 when ADataType is fp8_t, bf8_t or
+    // int8_t, and 8 for every other ADataType.
+    using AType = typename TestFixture::ADataType;
+
+    constexpr bool a_selects_vec16 = std::is_same_v<AType, ck_tile::fp8_t> ||
+                                     std::is_same_v<AType, ck_tile::bf8_t> ||
+                                     std::is_same_v<AType, ck_tile::int8_t>;
+
+    constexpr int VecLoadSize = a_selects_vec16 ? 16 : 8;
+
+    constexpr bool a_is_col_major =
+        std::is_same_v<typename TestFixture::ALayout, ck_tile::tensor_layout::gemm::ColumnMajor>;
+
+    for(const int M : {127, 255})
+    {
+        for(const int k_tiles : {2, 4})
+        {
+            const int K = k_tiles * TestFixture::K_Tile;
+
+            if constexpr(!a_is_col_major)
+            {
+                this->Run(M, N, K);
+            }
+            else if(M % VecLoadSize == 0)
+            {
+                this->Run(M, N, K);
+            }
+            else
+            {
+                // Column-major A with M not a multiple of VecLoadSize is expected to throw.
+                EXPECT_THROW((this->Run(M, N, K)), std::runtime_error);
+            }
+        }
+    }
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Regular_CompV3)
+{
+    constexpr int M = 512;
+    constexpr int N = 1024;
+    constexpr int K = 512;
+    this->Run(M, N, K);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, LargeMatrix_CompV3)
+{
+    constexpr int M = 2048;
+    constexpr int N = 2048;
+    constexpr int K = 2048;
+    this->Run(M, N, K);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, BatchedSmall_CompV3)
+{
+    constexpr int M = 256;
+    constexpr int N = 256;
+    constexpr int K = 256;
+    this->Run(M, N, K);
+}
+
+#undef TEST_SUITE_NAME
+
+#if defined(CK_TILE_USE_WMMA)
+// ============================================================================
+// Compute V3 Pipeline Tests with WMMA
+// ============================================================================
+
+#define TEST_SUITE_NAME TestGemmCompV3Wmma
+
+TYPED_TEST(TEST_SUITE_NAME, SmallM_CompV3Wmma)
+{
+    // Very small M (1 and 2 rows), each paired with K spanning 2 and 4 K tiles.
+    constexpr int N = 1024;
+
+    constexpr bool a_is_col_major =
+        std::is_same_v<typename TestFixture::ALayout, ck_tile::tensor_layout::gemm::ColumnMajor>;
+
+    for(const int M : {1, 2})
+    {
+        for(const int k_tiles : {2, 4})
+        {
+            const int K = k_tiles * TestFixture::K_Tile;
+
+            // A column-major A is expected to be rejected with std::runtime_error.
+            if constexpr(a_is_col_major)
+            {
+                EXPECT_THROW((this->Run(M, N, K)), std::runtime_error);
+            }
+            else
+            {
+                this->Run(M, N, K);
+            }
+        }
+    }
+}
+
+TYPED_TEST(TEST_SUITE_NAME, SingleTile_CompV3Wmma)
+{
+    this->Run(TestFixture::M_Tile, TestFixture::N_Tile, TestFixture::K_Tile);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Regular_CompV3Wmma)
+{
+    constexpr int M = 512;
+    constexpr int N = 1024;
+    constexpr int K = 512;
+    this->Run(M, N, K);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, LargeMatrix_CompV3Wmma)
+{
+    constexpr int M = 2048;
+    constexpr int N = 2048;
+    constexpr int K = 2048;
+    this->Run(M, N, K);
+}
+
+#undef TEST_SUITE_NAME
+#endif // CK_TILE_USE_WMMA
+
+// ============================================================================
+// Compute V4 Pipeline Tests
+// ============================================================================
+
+#define TEST_SUITE_NAME TestGemmCompV4
+
+TYPED_TEST(TEST_SUITE_NAME, SmallM_CompV4)
+{
+    std::vector<int> Ms{1, 2};
+    constexpr int N = 1024;
+    std::vector<int> Ks;
+    for(auto K_count : {2, 4})
+    {
+        Ks.push_back(K_count * TestFixture::K_Tile);
+    }
+
+    for(int M : Ms)
+    {
+        for(int K : Ks)
+        {
+            if constexpr(std::is_same_v<typename TestFixture::ALayout,
+                                        ck_tile::tensor_layout::gemm::ColumnMajor>)
+            {
+                EXPECT_THROW((this->Run(M, N, K)), std::runtime_error);
+            }
+            else
+            {
+                this->Run(M, N, K);
+            }
+        }
+    }
+}
+
+TYPED_TEST(TEST_SUITE_NAME, SingleTile_CompV4)
+{
+    this->Run(TestFixture::M_Tile, TestFixture::N_Tile, TestFixture::K_Tile);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Regular_CompV4)
+{
+    constexpr int M = 512;
+    constexpr int N = 1024;
+    constexpr int K = 512;
+    this->Run(M, N, K);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, LargeMatrix_CompV4)
+{
+    constexpr int M = 2048;
+    constexpr int N = 2048;
+    constexpr int K = 2048;
+    this->Run(M, N, K);
+}
+
+#undef TEST_SUITE_NAME
+
+#if defined(CK_TILE_USE_WMMA)
+// ============================================================================
+// Compute V4 Pipeline Tests with WMMA
+// ============================================================================
+
+#define TEST_SUITE_NAME TestGemmCompV4Wmma
+
+TYPED_TEST(TEST_SUITE_NAME, SingleTile_CompV4Wmma)
+{
+    this->Run(TestFixture::M_Tile, TestFixture::N_Tile, TestFixture::K_Tile);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Regular_CompV4Wmma)
+{
+    constexpr int M = 512;
+    constexpr int N = 1024;
+    constexpr int K = 512;
+    this->Run(M, N, K);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, LargeMatrix_CompV4Wmma)
+{
+    constexpr int M = 2048;
+    constexpr int N = 2048;
+    constexpr int K = 2048;
+    this->Run(M, N, K);
+}
+
+#undef TEST_SUITE_NAME
+#endif // CK_TILE_USE_WMMA
+
+// ============================================================================
+// Compute V6 Pipeline Tests
+// ============================================================================
+
+#define TEST_SUITE_NAME TestGemmCompV6
+
+TYPED_TEST(TEST_SUITE_NAME, SmallM_CompV6)
+{
+    std::vector<int> Ms{1, 2};
+    constexpr int N = 1024;
+    std::vector<int> Ks;
+    for(auto K_count : {2, 4})
+    {
+        Ks.push_back(K_count * TestFixture::K_Tile);
+    }
+
+    for(int M : Ms)
+    {
+        for(int K : Ks)
+        {
+            if constexpr(std::is_same_v<typename TestFixture::ALayout,
+                                        ck_tile::tensor_layout::gemm::ColumnMajor>)
+            {
+                EXPECT_THROW((this->Run(M, N, K)), std::runtime_error);
+            }
+            else
+            {
+                this->Run(M, N, K);
+            }
+        }
+    }
+}
+
+TYPED_TEST(TEST_SUITE_NAME, SingleTile_CompV6)
+{
+    this->Run(TestFixture::M_Tile, TestFixture::N_Tile, TestFixture::K_Tile);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, MidLargeM_CompV6)
+{
+    std::vector<int> Ms{127, 255};
+    constexpr int N = 1024;
+
+    std::vector<int> Ks;
+    for(auto K_count : {2, 4})
+    {
+        Ks.push_back(K_count * TestFixture::K_Tile);
+    }
+
+    constexpr int VecLoadSize = (std::is_same_v<typename TestFixture::ADataType, ck_tile::fp8_t> ||
+                                 std::is_same_v<typename TestFixture::ADataType, ck_tile::bf8_t> ||
+                                 std::is_same_v<typename TestFixture::ADataType, ck_tile::int8_t>)
+                                    ? 16
+                                    : 8;
+
+    for(int M : Ms)
+    {
+        for(int K : Ks)
+        {
+            if constexpr(std::is_same_v<typename TestFixture::ALayout,
+                                        ck_tile::tensor_layout::gemm::ColumnMajor>)
+            {
+                if(M % VecLoadSize == 0)
+                {
+                    this->Run(M, N, K);
+                }
+                else
+                {
+                    EXPECT_THROW((this->Run(M, N, K)), std::runtime_error);
+                }
+            }
+            else
+            {
+                this->Run(M, N, K);
+            }
+        }
+    }
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Regular_CompV6)
+{
+    constexpr int M = 512;
+    constexpr int N = 1024;
+    constexpr int K = 512;
+    this->Run(M, N, K);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, LargeMatrix_CompV6)
+{
+    constexpr int M = 2048;
+    constexpr int N = 2048;
+    constexpr int K = 2048;
+    this->Run(M, N, K);
+}
+
+#undef TEST_SUITE_NAME
+
+// ============================================================================
+// Persistent Kernel Tests
+// ============================================================================
+
+#define TEST_SUITE_NAME TestGemmPersistent
+
+TYPED_TEST(TEST_SUITE_NAME, SmallM_Persistent)
+{
+    std::vector<int> Ms{1, 2};
+    constexpr int N = 1024;
+    std::vector<int> Ks;
+    for(auto K_count : {2, 4})
+    {
+        Ks.push_back(K_count * TestFixture::K_Tile);
+    }
+
+    for(int M : Ms)
+    {
+        for(int K : Ks)
+        {
+            if constexpr(std::is_same_v<typename TestFixture::ALayout,
+                                        ck_tile::tensor_layout::gemm::ColumnMajor>)
+            {
+                EXPECT_THROW((this->Run(M, N, K)), std::runtime_error);
+            }
+            else
+            {
+                this->Run(M, N, K);
+            }
+        }
+    }
+}
+
+TYPED_TEST(TEST_SUITE_NAME, SingleTile_Persistent)
+{
+    this->Run(TestFixture::M_Tile, TestFixture::N_Tile, TestFixture::K_Tile);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Regular_Persistent)
+{
+    constexpr int M = 512;
+    constexpr int N = 1024;
+    constexpr int K = 512;
+    this->Run(M, N, K);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, LargeMatrix_Persistent)
+{
+    constexpr int M = 2048;
+    constexpr int N = 2048;
+    constexpr int K = 2048;
+    this->Run(M, N, K);
+}
+
+#undef TEST_SUITE_NAME
+
+#if defined(CK_TILE_USE_WMMA)
+// ============================================================================
+// Persistent Kernel Tests with WMMA
+// ============================================================================
+
+#define TEST_SUITE_NAME TestGemmPersistentWmma
+
+TYPED_TEST(TEST_SUITE_NAME, SmallM_PersistentWmma)
+{
+    std::vector<int> Ms{1, 2};
+    constexpr int N = 1024;
+    std::vector<int> Ks;
+    for(auto K_count : {2, 4})
+    {
+        Ks.push_back(K_count * TestFixture::K_Tile);
+    }
+
+    for(int M : Ms)
+    {
+        for(int K : Ks)
+        {
+            if constexpr(std::is_same_v<typename TestFixture::ALayout,
+                                        ck_tile::tensor_layout::gemm::ColumnMajor>)
+            {
+                EXPECT_THROW((this->Run(M, N, K)), std::runtime_error);
+            }
+            else
+            {
+                this->Run(M, N, K);
+            }
+        }
+    }
+}
+
+TYPED_TEST(TEST_SUITE_NAME, SingleTile_PersistentWmma)
+{
+    this->Run(TestFixture::M_Tile, TestFixture::N_Tile, TestFixture::K_Tile);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, Regular_PersistentWmma)
+{
+    constexpr int M = 512;
+    constexpr int N = 1024;
+    constexpr int K = 512;
+    this->Run(M, N, K);
+}
+
+TYPED_TEST(TEST_SUITE_NAME, LargeMatrix_PersistentWmma)
+{
+    constexpr int M = 2048;
+    constexpr int N = 2048;
+    constexpr int K = 2048;
+    this->Run(M, N, K);
+}
+
+#undef TEST_SUITE_NAME
+#endif // CK_TILE_USE_WMMA
diff --git a/test/ck_tile/gemm_block_scale/CMakeLists.txt b/test/ck_tile/gemm_block_scale/CMakeLists.txt
index 3a49e69c37..1c4a25c8bd 100644
--- a/test/ck_tile/gemm_block_scale/CMakeLists.txt
+++ b/test/ck_tile/gemm_block_scale/CMakeLists.txt
@@ -5,7 +5,7 @@ endif()
 
 list(APPEND TEST_GEMM_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0)
 
-if(GPU_TARGETS MATCHES "gfx94" OR GPU_TARGETS MATCHES "gfx95")
+if(GPU_TARGETS MATCHES "gfx94|gfx95|gfx12")
     # Typed Test Suite for GEMM Quantization
     add_gtest_executable(test_tile_gemm_quant_typed 
         test_gemm_quant_typed.cpp
diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_base.hpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_base.hpp
index 6454101daf..6226a2de9e 100644
--- a/test/ck_tile/gemm_block_scale/test_gemm_quant_base.hpp
+++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_base.hpp
@@ -69,7 +69,15 @@ class TestCkTileGemmQuantBase : public ::testing::Test
         constexpr bool kPadM = false;
         constexpr bool kPadN = false;
         constexpr bool kPadK = false;
-
+        // The WP pipeline requires the per-thread tile size to be aligned to VectorLoadSize:
+        //   (WG::kM * WG::kK * sizeof(ADataType) * MIterPerWarp / WaveSize) % VectorLoadSize == 0
+        // gfx9 cards satisfy this requirement but gfx12 does not, so the limit only needs to be
+        // checked for RDNA cards, i.e. a wave size of 32 is assumed.
+        constexpr ck_tile::index_t WaveSize     = 32;
+        constexpr ck_tile::index_t MIterPerWarp = M_Tile / (M_Warp * M_Warp_Tile);
+        constexpr bool SupportVectorSize16 =
+            (M_Warp_Tile * K_Warp_Tile * sizeof(ADataType) * MIterPerWarp / WaveSize) % 16 == 0;
+        constexpr int VectorSize = PreshuffleB ? (SupportVectorSize16 ? 16 : 8) : 16;
         using CodegenGemmShape =
             ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
                                    ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
@@ -89,7 +97,9 @@ class TestCkTileGemmQuantBase : public ::testing::Test
                                                                ALayout,
                                                                BLayout,
                                                                GemmConfig::TransposeC,
-                                                               DoubleSmemBuffer>;
+                                                               DoubleSmemBuffer,
+                                                               false,
+                                                               VectorSize>;
 
         // Let the derived class create the appropriate pipeline and epilogue
         static_cast<Derived*>(this)
diff --git a/test/ck_tile/gemm_block_scale/test_gemm_quant_fixtures.hpp b/test/ck_tile/gemm_block_scale/test_gemm_quant_fixtures.hpp
index cabc0ec02c..5aac095514 100644
--- a/test/ck_tile/gemm_block_scale/test_gemm_quant_fixtures.hpp
+++ b/test/ck_tile/gemm_block_scale/test_gemm_quant_fixtures.hpp
@@ -7,6 +7,16 @@
 #include "ck_tile/host/permute_pk_int4.hpp"
 #include "ck_tile/host/tensor_shuffle_utils.hpp"
 
+template <bool is_8bit>
+constexpr ck_tile::index_t get_k_warp_tile()
+{
+#if CK_TILE_USE_WMMA
+    return 16;
+#else
+    return is_8bit ? 64 : 32;
+#endif
+}
+
 struct GemmConfigBase
 {
     static constexpr bool kPadM = false;
@@ -40,7 +50,7 @@ struct GemmConfigBase
 
     static constexpr ck_tile::index_t M_Warp_Tile = 16;
     static constexpr ck_tile::index_t N_Warp_Tile = 16;
-    static constexpr ck_tile::index_t K_Warp_Tile = 32;
+    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<false>();
 };
 
 struct GemmConfigPreshuffleQuant : public GemmConfigBase
@@ -75,7 +85,7 @@ struct GemmConfigPreshuffleBDecode : public GemmConfigBase
 
     static constexpr ck_tile::index_t M_Warp_Tile = 16;
     static constexpr ck_tile::index_t N_Warp_Tile = 16;
-    static constexpr ck_tile::index_t K_Warp_Tile = 64;
+    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<true>();
 };
 
 struct GemmConfigPreshuffleBPrefill : public GemmConfigBase
@@ -94,7 +104,7 @@ struct GemmConfigPreshuffleBPrefill : public GemmConfigBase
 
     static constexpr ck_tile::index_t M_Warp_Tile = 16;
     static constexpr ck_tile::index_t N_Warp_Tile = 16;
-    static constexpr ck_tile::index_t K_Warp_Tile = 64;
+    static constexpr ck_tile::index_t K_Warp_Tile = get_k_warp_tile<true>();
 };
 
 struct GemmConfigPreshuffleBPrefillTiledPermuteN : public GemmConfigPreshuffleBPrefill
@@ -132,7 +142,7 @@ class TestCkTileGemmAQuant : public TestCkTileGemmQuantBase<Tuple, TestCkTileGem
     {
         const ck_tile::index_t stride_A = K;
         const ck_tile::index_t stride_B = K;
-        const ck_tile::index_t stride_C = M;
+        const ck_tile::index_t stride_C = N;
 
         // AQuant uses grouped quantization for A matrix
         const ck_tile::index_t AQK = ck_tile::integer_divide_ceil(K, QuantGroupSize::kK);
@@ -373,7 +383,7 @@ class TestCkTileGemmBQuant : public TestCkTileGemmQuantBase<Tuple, TestCkTileGem
     {
         const ck_tile::index_t stride_A = K;
         const ck_tile::index_t stride_B = K;
-        const ck_tile::index_t stride_C = M;
+        const ck_tile::index_t stride_C = N;
 
         // BQuant uses block/grouped quantization for B matrix
         const ck_tile::index_t BQN       = ck_tile::integer_divide_ceil(N, QuantGroupSize::kN);
@@ -629,7 +639,7 @@ class TestCkTileGemmRowColQuant
     {
         const ck_tile::index_t stride_A = K;
         const ck_tile::index_t stride_B = K;
-        const ck_tile::index_t stride_C = M;
+        const ck_tile::index_t stride_C = N;
 
         // RowColQuant uses per-row and per-column scales
         const ck_tile::index_t stride_row_scales = 1;
@@ -846,7 +856,7 @@ class TestCkTileGemmTensorQuant
     {
         const ck_tile::index_t stride_A = K;
         const ck_tile::index_t stride_B = K;
-        const ck_tile::index_t stride_C = M;
+        const ck_tile::index_t stride_C = N;
 
         // TensorQuant uses single scalar scale for each tensor
         const ck_tile::index_t stride_scale_a = 1;
diff --git a/tile_engine/ops/commons/test_benchmark.sh b/tile_engine/ops/commons/test_benchmark.sh
index 1fb7c163af..e2e0324da8 100755
--- a/tile_engine/ops/commons/test_benchmark.sh
+++ b/tile_engine/ops/commons/test_benchmark.sh
@@ -1,4 +1,7 @@
 #!/bin/bash
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 
 # Test script for tile engine GEMM benchmarks
 # This script demonstrates how to run the new individual benchmark executables
diff --git a/tile_engine/ops/commons/test_validation.py b/tile_engine/ops/commons/test_validation.py
index 79f24265f1..46fb008c27 100644
--- a/tile_engine/ops/commons/test_validation.py
+++ b/tile_engine/ops/commons/test_validation.py
@@ -1,4 +1,7 @@
 #!/usr/bin/env python
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 """
 Test script to verify that the validation logic is working correctly.
 """
diff --git a/tile_engine/ops/commons/validation_utils.py b/tile_engine/ops/commons/validation_utils.py
index 3eb7bf8b57..5787446e8c 100644
--- a/tile_engine/ops/commons/validation_utils.py
+++ b/tile_engine/ops/commons/validation_utils.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 # SPDX-License-Identifier: MIT
-# Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
 """
 Validation utilities for GEMM kernel generation.
diff --git a/tile_engine/ops/gemm/codegen_utils.py b/tile_engine/ops/gemm/codegen_utils.py
index 0020fccf05..eecc2228a6 100644
--- a/tile_engine/ops/gemm/codegen_utils.py
+++ b/tile_engine/ops/gemm/codegen_utils.py
@@ -1,5 +1,5 @@
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 # SPDX-License-Identifier: MIT
-# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 # -*- coding: utf-8 -*-
 
diff --git a/tile_engine/ops/gemm/gemm_benchmark.hpp b/tile_engine/ops/gemm/gemm_benchmark.hpp
index 0e2619785e..7c8df32ad8 100644
--- a/tile_engine/ops/gemm/gemm_benchmark.hpp
+++ b/tile_engine/ops/gemm/gemm_benchmark.hpp
@@ -1,5 +1,5 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/tile_engine/ops/gemm/gemm_benchmark.py b/tile_engine/ops/gemm/gemm_benchmark.py
index 9f323f2640..cc04dbe0db 100755
--- a/tile_engine/ops/gemm/gemm_benchmark.py
+++ b/tile_engine/ops/gemm/gemm_benchmark.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 # SPDX-License-Identifier: MIT
-# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 import sys
 import json
diff --git a/tile_engine/ops/gemm/gemm_benchmark_single.cpp b/tile_engine/ops/gemm/gemm_benchmark_single.cpp
index bbcc6eb505..6323c066a1 100644
--- a/tile_engine/ops/gemm/gemm_benchmark_single.cpp
+++ b/tile_engine/ops/gemm/gemm_benchmark_single.cpp
@@ -1,5 +1,5 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <functional>
diff --git a/tile_engine/ops/gemm/gemm_common.hpp b/tile_engine/ops/gemm/gemm_common.hpp
index 4732f2a1ba..899221547f 100644
--- a/tile_engine/ops/gemm/gemm_common.hpp
+++ b/tile_engine/ops/gemm/gemm_common.hpp
@@ -1,5 +1,5 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/tile_engine/ops/gemm/gemm_instance_builder.py b/tile_engine/ops/gemm/gemm_instance_builder.py
index 1aff42b902..8885c821c1 100644
--- a/tile_engine/ops/gemm/gemm_instance_builder.py
+++ b/tile_engine/ops/gemm/gemm_instance_builder.py
@@ -1,4 +1,7 @@
 #!/usr/bin/env python
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 
 import os
 import json
diff --git a/tile_engine/ops/gemm/gemm_profiler.hpp b/tile_engine/ops/gemm/gemm_profiler.hpp
index 575e5240a8..3c6bbc34d3 100644
--- a/tile_engine/ops/gemm/gemm_profiler.hpp
+++ b/tile_engine/ops/gemm/gemm_profiler.hpp
@@ -1,5 +1,5 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark.hpp b/tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark.hpp
index 53dcdb5e1f..f8c196e32a 100644
--- a/tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark.hpp
+++ b/tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark.hpp
@@ -1,5 +1,5 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark.py b/tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark.py
index fb81b9c2c2..044e08baca 100755
--- a/tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark.py
+++ b/tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 # SPDX-License-Identifier: MIT
-# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 import sys
 import json
diff --git a/tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark_single.cpp b/tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark_single.cpp
index 032a625354..41d2f736e1 100644
--- a/tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark_single.cpp
+++ b/tile_engine/ops/gemm_multi_d/gemm_multi_d_benchmark_single.cpp
@@ -1,5 +1,5 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <functional>
diff --git a/tile_engine/ops/gemm_multi_d/gemm_multi_d_common.hpp b/tile_engine/ops/gemm_multi_d/gemm_multi_d_common.hpp
index 4732f2a1ba..899221547f 100644
--- a/tile_engine/ops/gemm_multi_d/gemm_multi_d_common.hpp
+++ b/tile_engine/ops/gemm_multi_d/gemm_multi_d_common.hpp
@@ -1,5 +1,5 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/tile_engine/ops/gemm_multi_d/gemm_multi_d_instance_builder.py b/tile_engine/ops/gemm_multi_d/gemm_multi_d_instance_builder.py
index 3f7858f146..cc167fb75f 100644
--- a/tile_engine/ops/gemm_multi_d/gemm_multi_d_instance_builder.py
+++ b/tile_engine/ops/gemm_multi_d/gemm_multi_d_instance_builder.py
@@ -1,4 +1,7 @@
 #!/usr/bin/env python
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
+
 
 import os
 import json
diff --git a/tile_engine/ops/gemm_multi_d/gemm_multi_d_profiler.hpp b/tile_engine/ops/gemm_multi_d/gemm_multi_d_profiler.hpp
index 8e19c11c7d..3a2cdc71fe 100644
--- a/tile_engine/ops/gemm_multi_d/gemm_multi_d_profiler.hpp
+++ b/tile_engine/ops/gemm_multi_d/gemm_multi_d_profiler.hpp
@@ -1,5 +1,5 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/tile_engine/ops/gemm_preshuffle/commons/validation_utils.py b/tile_engine/ops/gemm_preshuffle/commons/validation_utils.py
index b38ff5dffb..70ce3b0d72 100644
--- a/tile_engine/ops/gemm_preshuffle/commons/validation_utils.py
+++ b/tile_engine/ops/gemm_preshuffle/commons/validation_utils.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 # SPDX-License-Identifier: MIT
-# Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
 
 """
 Validation utilities for GEMM kernel generation.
diff --git a/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark.hpp b/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark.hpp
index 77a9f26527..748fe581d3 100644
--- a/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark.hpp
+++ b/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark.hpp
@@ -1,3 +1,6 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
 #pragma once
 
 #include "ck_tile/core.hpp"
diff --git a/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark.py b/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark.py
index 0217a439f2..d8892be7d6 100755
--- a/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark.py
+++ b/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark.py
@@ -1,6 +1,6 @@
 #!/usr/bin/env python3
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 # SPDX-License-Identifier: MIT
-# Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 import sys
 import json
diff --git a/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark_single.cpp b/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark_single.cpp
index 1f03d1cf9b..4fbb25f0c9 100644
--- a/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark_single.cpp
+++ b/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_benchmark_single.cpp
@@ -1,5 +1,5 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include <iostream>
 #include <functional>
diff --git a/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_common.hpp b/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_common.hpp
index abaa5ebd46..1b2cfe3735 100644
--- a/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_common.hpp
+++ b/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_common.hpp
@@ -1,5 +1,5 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_instance_builder.py b/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_instance_builder.py
index 57c250f57e..9ce6d8cb25 100644
--- a/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_instance_builder.py
+++ b/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_instance_builder.py
@@ -1,5 +1,5 @@
-## Copyright © Advanced Micro Devices, Inc. or its affiliates.
-## SPDX-License-Identifier: MIT
+# Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+# SPDX-License-Identifier: MIT
 
 import argparse
 import os
diff --git a/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_profiler.hpp b/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_profiler.hpp
index 85b731c231..739bd7e677 100644
--- a/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_profiler.hpp
+++ b/tile_engine/ops/gemm_preshuffle/gemm_preshuffle_profiler.hpp
@@ -1,4 +1,4 @@
-// Copyright © Advanced Micro Devices, Inc. or its affiliates.
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
 // SPDX-License-Identifier: MIT
 
 #pragma once
diff --git a/tutorial/CMakeLists.txt b/tutorial/CMakeLists.txt
new file mode 100644
index 0000000000..a2f35ca53f
--- /dev/null
+++ b/tutorial/CMakeLists.txt
@@ -0,0 +1,15 @@
+include_directories(BEFORE
+    ${PROJECT_SOURCE_DIR}/include
+    ${PROJECT_SOURCE_DIR}/library/include
+)
+
+message(STATUS "Building tutorials...")
+add_custom_target(tutorials)
+
+# Add every immediate subdirectory that ships its own CMakeLists.txt.
+file(GLOB dir_list LIST_DIRECTORIES true *)
+foreach(subdir ${dir_list})
+    if(IS_DIRECTORY "${subdir}" AND EXISTS "${subdir}/CMakeLists.txt")
+        add_subdirectory("${subdir}")
+    endif()
+endforeach()
diff --git a/example/ck_tile/39_copy/CMakeLists.txt b/tutorial/ck_tile/00_copy_kernel/CMakeLists.txt
similarity index 54%
rename from example/ck_tile/39_copy/CMakeLists.txt
rename to tutorial/ck_tile/00_copy_kernel/CMakeLists.txt
index 98397a33d2..91dd036eff 100644
--- a/example/ck_tile/39_copy/CMakeLists.txt
+++ b/tutorial/ck_tile/00_copy_kernel/CMakeLists.txt
@@ -1,7 +1,9 @@
-add_executable(tile_example_copy EXCLUDE_FROM_ALL copy_basic.cpp)
+add_executable(tile_tutorial_copy_kernel EXCLUDE_FROM_ALL copy_basic.cpp)
 
 # Impact: This flag ensures that the compiler doesn't make 
 # assumptions about memory aliasing that could interfere with Composable Kernel's explicit memory access patterns.
-target_compile_options(tile_example_copy PRIVATE
+target_compile_options(tile_tutorial_copy_kernel PRIVATE
   -mllvm -enable-noalias-to-md-conversion=0
 )
+
+add_dependencies(tutorials tile_tutorial_copy_kernel)
diff --git a/example/ck_tile/39_copy/README.md b/tutorial/ck_tile/00_copy_kernel/README.md
similarity index 100%
rename from example/ck_tile/39_copy/README.md
rename to tutorial/ck_tile/00_copy_kernel/README.md
diff --git a/example/ck_tile/39_copy/copy_basic.cpp b/tutorial/ck_tile/00_copy_kernel/copy_basic.cpp
similarity index 86%
rename from example/ck_tile/39_copy/copy_basic.cpp
rename to tutorial/ck_tile/00_copy_kernel/copy_basic.cpp
index de91dc1be9..282e9ff8c1 100644
--- a/example/ck_tile/39_copy/copy_basic.cpp
+++ b/tutorial/ck_tile/00_copy_kernel/copy_basic.cpp
@@ -54,10 +54,10 @@ bool run(const ck_tile::ArgParser& arg_parser)
     x_buf.ToDevice(x_host.data());
 
     // Define tile configuration
-    using ThreadTile = ck_tile::sequence<1, 4>;   // per-thread tile size along M and N
-    using WaveTile   = ck_tile::sequence<64, 4>;  // wave size along M and N dimension
-    using BlockWaves = ck_tile::sequence<4, 1>;   // number of waves along M dimension
-    using BlockTile  = ck_tile::sequence<512, 4>; // block size along M and N dimension
+    using ThreadTile = ck_tile::sequence<1, 4>;  // per-thread tile size along M and N
+    using WaveTile   = ck_tile::sequence<64, 4>; // per-wave tile size along M and N dimension
+    using BlockWaves = ck_tile::sequence<4, 1>; // number of waves per block along M and N dimension
+    using BlockTile  = ck_tile::sequence<512, 4>; // per-block tile size along M and N dimension
 
     // Calculate grid size
     ck_tile::index_t kGridSize =
@@ -68,14 +68,14 @@ bool run(const ck_tile::ArgParser& arg_parser)
     using Shape   = ck_tile::TileCopyShape<BlockWaves, BlockTile, WaveTile, ThreadTile>;
     using Problem = ck_tile::TileCopyProblem<XDataType, Shape>;
     using Policy  = ck_tile::TileCopyPolicy<Problem>;
-    using Kernel  = ck_tile::ElementWiseTileCopyKernel<Problem, Policy>;
-    // using Kernel  = ck_tile::TileCopyKernel<Problem, Policy>;
-    // using Kernel = ck_tile::TileCopyKernel_LDS<Problem, Policy>;
+    using Kernel  = ck_tile::ElementWiseTileCopyKernel<Problem, Policy>; // operates on element by
+                                                                         // element basis.
 
-    // question: Why do we not have a pipeline?
-    // answer: For basic copy operation, pipeline is not needed.
-    // we intentionally do not use pipeline for this example and let the kernel be composite of
-    // Problem and Policy
+    // We also implement two variations of the copy kernel:
+    // 1. TileCopyKernel: the basic copy kernel, which operates on a tile-by-tile basis.
+    //      using Kernel = ck_tile::TileCopyKernel<Problem, Policy>;
+    // 2. TileCopyKernel_LDS: a copy kernel that operates tile by tile and uses the LDS.
+    //      using Kernel = ck_tile::TileCopyKernel_LDS<Problem, Policy>;
 
     auto blockSize = Kernel::BlockSize();
 
diff --git a/example/ck_tile/39_copy/copy_basic.hpp b/tutorial/ck_tile/00_copy_kernel/copy_basic.hpp
similarity index 100%
rename from example/ck_tile/39_copy/copy_basic.hpp
rename to tutorial/ck_tile/00_copy_kernel/copy_basic.hpp
diff --git a/example/ck_tile/39_copy/test_tile_example.sh b/tutorial/ck_tile/00_copy_kernel/test_tile_example.sh
similarity index 95%
rename from example/ck_tile/39_copy/test_tile_example.sh
rename to tutorial/ck_tile/00_copy_kernel/test_tile_example.sh
index 416338fac4..4ee5fdf15d 100755
--- a/example/ck_tile/39_copy/test_tile_example.sh
+++ b/tutorial/ck_tile/00_copy_kernel/test_tile_example.sh
@@ -4,7 +4,7 @@
 
 set -euo pipefail
 
-BIN="${BIN:-../../../build/bin/tile_example_copy}"
+BIN="${BIN:-../../../build/bin/tile_tutorial_copy_kernel}"
 WARMUP="${WARMUP:-20}"
 REPEAT="${REPEAT:-100}"
 VALIDATE="${VALIDATE:-1}"
diff --git a/tutorial/ck_tile/01_naive_gemm/BLOCK_LEVEL_PIPELINE.md b/tutorial/ck_tile/01_naive_gemm/BLOCK_LEVEL_PIPELINE.md
new file mode 100644
index 0000000000..114fccfd56
--- /dev/null
+++ b/tutorial/ck_tile/01_naive_gemm/BLOCK_LEVEL_PIPELINE.md
@@ -0,0 +1,589 @@
+# Block-Level Pipeline: PracticeGemmBlockPipelineAGmemBGmemCreg
+
+## Overview
+
+The **Block-Level Pipeline** is where the actual GEMM computation happens for one block tile. It orchestrates:
+1. **Data movement** from DRAM → Registers → LDS
+2. **GEMM computation** using data in LDS
+3. **Iteration** over the K dimension when needed
+
+This pipeline is called by the host-level pipeline for each block tile that covers a portion of the output matrix C.
+
+---
+
+## Architecture: Problem and Policy
+
+Like other components in CK Tile, the block pipeline follows the **Problem/Policy** pattern:
+
+### Problem: `PracticeGemmBlockPipelineProblem`
+Contains:
+- **Data types**: `ADataType`, `BDataType`, `CDataType`, `AccDataType`
+- **Shape information**: `BlockTile` and `WaveTile` dimensions
+
+### Policy: `PracticeGemmBlockPolicy`
+Contains strategies for:
+1. **Tile Distribution** (`MakeADramTileDistribution`, `MakeBDramTileDistribution`)
+   - Defines how 256 threads in a block map to elements of a block tile
+   - Each thread knows which elements to load/store from DRAM to its registers
+   - We'll cover tile distribution construction in detail later
+
+2. **LDS Layout** (`MakeALdsBlockDescriptor`, `MakeBLdsBlockDescriptor`)
+   - Describes how data is logically organized in Local Data Share (LDS)
+   - Optimizes for bank conflict avoidance and efficient access patterns
+   - We'll cover LDS descriptor construction in detail later
+
+3. **Warp Pipeline** (`GetPracticeWaveGemmPipeline`)
+   - Returns the warp-level GEMM implementation
+
+---
+
+## Inputs and Outputs
+
+```cpp
+template <typename ADramBlockWindowTmp, typename BDramBlockWindowTmp>
+CK_TILE_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                index_t num_loop,
+                                void* p_smem) const
+```
+
+### Inputs:
+- `a_dram_block_window_tmp`: Tile window over A in DRAM (size: MPerBlock × KPerBlock)
+- `b_dram_block_window_tmp`: Tile window over B in DRAM (size: NPerBlock × KPerBlock)
+- `num_loop`: Number of iterations along K dimension
+- `p_smem`: Pointer to shared memory (LDS)
+
+### Output:
+- `c_block_tile`: A `static_distributed_tensor` containing the computed C tile in registers (VGPRs)
+
+---
+
+## Step-by-Step Walkthrough
+
+### Step 1: Create LDS Tensor Views
+
+```cpp
+// A tile in LDS
+ADataType* p_a_lds = static_cast<ADataType*>(p_smem);
+constexpr auto a_lds_block_desc = Policy::template MakeALdsBlockDescriptor<Problem>();
+auto a_lds_block = make_tensor_view<address_space_enum::lds>(p_a_lds, a_lds_block_desc);
+
+// B tile in LDS (placed after A in shared memory)
+BDataType* p_b_lds = static_cast<BDataType*>(
+    static_cast<void*>(static_cast<char*>(p_smem) + a_lds_block_space_size_aligned));
+constexpr auto b_lds_block_desc = Policy::template MakeBLdsBlockDescriptor<Problem>();
+auto b_lds_block = make_tensor_view<address_space_enum::lds>(p_b_lds, b_lds_block_desc);
+```
+
+**What's happening:**
+- We partition the shared memory (`p_smem`) into two regions: one for A, one for B
+- We create **tensor views** over these LDS regions using descriptors from the policy
+- `a_lds_block` and `b_lds_block` are logical views over raw LDS memory
+
+**Memory Layout:**
+```
+Shared Memory (LDS):
+┌─────────────────────┬─────────────────────┐
+│   A Block Tile      │   B Block Tile      │
+│   (256×32 fp16)     │   (128×32 fp16)     │
+└─────────────────────┴─────────────────────┘
+↑                     ↑
+p_a_lds               p_b_lds
+```
+
+---
+
+### Step 2: Create Tile Windows for Data Movement
+
+We create **6 tile windows** for different purposes:
+
+#### 2a. DRAM → Registers (Load from DRAM)
+
+```cpp
+auto a_copy_dram_window = make_tile_window(
+    a_dram_block_window_tmp.get_bottom_tensor_view(),
+    make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),  // 256×32
+    a_dram_block_window_tmp.get_window_origin(),
+    Policy::template MakeADramTileDistribution<Problem>());  // ← Tile distribution!
+```
+
+**Key Points:**
+- `a_copy_dram_window` is a `tile_window_with_static_distribution`
+- The **tile distribution** tells each thread which elements to load from DRAM
+- This window will **slide along the K dimension** in the loop
+
+#### 2b. Registers → LDS (Store to LDS)
+
+```cpp
+auto a_copy_lds_window = make_tile_window(
+    a_lds_block,
+    make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),  // 256×32
+    {0, 0},  // Origin at (0, 0) in LDS
+    a_copy_dram_window.get_tile_distribution());  // ← Same distribution as DRAM!
+```
+
+**Key Points:**
+- Uses the **same tile distribution** as `a_copy_dram_window`
+- This ensures each thread stores to LDS in the same pattern it loaded from DRAM
+- Origin is always `{0, 0}` because LDS is reused for each K iteration
+
+#### 2c. LDS → Registers (GEMM Input)
+
+```cpp
+auto a_lds_gemm_window = make_tile_window(
+    a_lds_block,
+    make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
+    {0, 0});  // No tile distribution!
+```
+
+**Key Points:**
+- This is a `tile_window_with_static_lengths` (no explicit distribution)
+- Used as input to the warp-level GEMM
+- The warp GEMM will handle its own thread mapping internally
+
+**Similar windows are created for B:**
+- `b_copy_dram_window`: Load B from DRAM
+- `b_copy_lds_window`: Store B to LDS
+- `b_lds_gemm_window`: Read B from LDS for GEMM
+
+---
+
+### Step 3: Create Distributed Tensors (VGPRs)
+
+```cpp
+using ABlockTileDistr = decltype(a_copy_dram_window.get_tile_distribution());
+using BBlockTileDistr = decltype(b_copy_dram_window.get_tile_distribution());
+
+using ABlockTile = decltype(make_static_distributed_tensor<ADataType>(ABlockTileDistr{}));
+using BBlockTile = decltype(make_static_distributed_tensor<BDataType>(BBlockTileDistr{}));
+
+ABlockTile a_block_tile;  // Per-thread registers for A
+BBlockTile b_block_tile;  // Per-thread registers for B
+```
+
+#### What is `make_static_distributed_tensor`?
+
+**`make_static_distributed_tensor`** creates a **`static_distributed_tensor`**, which is a compile-time abstraction for **distributed per-thread register storage**.
+
+**Key Properties:**
+1. **Per-thread VGPRs**: Each thread owns a **different slice** of the tile in its registers
+2. **Compile-time sized**: Buffer size determined by tile distribution at compile time
+3. **Zero-overhead**: All indexing and layout transformations happen at compile time
+
+**How it works:**
+
+```cpp
+template <typename DataType_, typename StaticTileDistribution_>
+struct static_distributed_tensor
+{
+    using DataType = remove_cvref_t<DataType_>;
+    using StaticTileDistribution = remove_cvref_t<StaticTileDistribution_>;
+    
+    // Calculate per-thread storage size from tile distribution
+    using ThreadTensorDesc = 
+        remove_cvref_t<decltype(StaticTileDistribution{}.get_ys_to_d_descriptor())>;
+    
+    static constexpr index_t kThreadElementSpaceSize = 
+        ThreadTensorDesc{}.get_element_space_size();
+    
+    // Per-thread register array (VGPRs)
+    thread_buffer<DataType, get_thread_buffer_size()> thread_buf_;
+};
+```
+
+**The tile distribution defines:**
+- **Which elements each thread owns** in the tile
+- **How many elements** each thread stores (buffer size)
+- **How elements are laid out** in each thread's registers
+
+**Concrete Example for 256×32 tile with 256 threads:**
+
+```
+Thread 0:  a_block_tile.thread_buf_ = [A[0,0], A[0,1], ..., A[0,31]]   (32 fp16 values)
+Thread 1:  a_block_tile.thread_buf_ = [A[1,0], A[1,1], ..., A[1,31]]   (32 fp16 values)
+Thread 2:  a_block_tile.thread_buf_ = [A[2,0], A[2,1], ..., A[2,31]]   (32 fp16 values)
+...
+Thread 255: a_block_tile.thread_buf_ = [A[255,0], A[255,1], ..., A[255,31]] (32 fp16 values)
+```
+
+**Collectively:**
+- All 256 threads together hold the **entire 256×32 tile** (8192 elements)
+- Each thread's buffer lives in its **own VGPRs**
+- No two threads own the same element
+
+**Distributed Ownership Analogy:**
+Think of a tile as a **jigsaw puzzle**:
+- The **tile distribution** is the cutting pattern
+- Each **thread** gets one puzzle piece (its slice)
+- Each **`static_distributed_tensor`** is a box holding all pieces
+- Each thread's **`thread_buf_`** is its individual piece in its own registers
+
+---
+
+### Step 4: The GEMM Loop
+
+```cpp
+// Initialize C accumulator to zero
+auto c_block_tile = decltype(block_gemm(a_lds_gemm_window, b_lds_gemm_window)){};
+tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile);
+
+index_t iCounter = num_loop;  // Number of K iterations
+
+while(iCounter > 0)
+{
+    // 1. Load from DRAM to registers
+    a_block_tile = load_tile(a_copy_dram_window);  // DRAM → VGPRs
+    b_block_tile = load_tile(b_copy_dram_window);  // DRAM → VGPRs
+    
+    // 2. Move windows for next iteration
+    move_tile_window(a_copy_dram_window, a_dram_tile_window_step);  // Step by (0, 32)
+    move_tile_window(b_copy_dram_window, b_dram_tile_window_step);  // Step by (0, 32)
+    
+    // 3. Store from registers to LDS
+    store_tile(a_copy_lds_window, a_block_tile);  // VGPRs → LDS
+    store_tile(b_copy_lds_window, b_block_tile);  // VGPRs → LDS
+    
+    // 4. Synchronize threads (ensure all data is in LDS)
+    block_sync_lds();
+    
+    // 5. Compute GEMM using data in LDS
+    block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
+    
+    // 6. Synchronize threads (before overwriting LDS in next iteration)
+    block_sync_lds();
+    
+    iCounter--;
+}
+
+return c_block_tile;  // Return accumulated result in registers
+```
+
+---
+
+## Detailed Loop Breakdown
+
+### Phase 1: Load (DRAM → VGPRs)
+
+```cpp
+a_block_tile = load_tile(a_copy_dram_window);
+```
+
+**What happens:**
+1. Each thread reads **its assigned elements** from DRAM (determined by tile distribution)
+2. Data is loaded into **per-thread registers** (VGPRs)
+3. Uses **vectorized loads** for efficiency (e.g., loading 8 fp16 values at once)
+
+**Example for Thread 0:**
+```
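+Thread 0 loads (using the simplified one-row-per-thread mapping from Step 3):
+  A[0,0:7]    (8 fp16 values, one vector load)
+  A[0,8:15]   (8 fp16 values, one vector load)
+  ...         (4 vector loads cover the thread's 32 values)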
+```
+
+### Phase 2: Move Windows
+
+```cpp
+constexpr ADramTileWindowStep a_dram_tile_window_step = make_array(0, KPerBlock);
+move_tile_window(a_copy_dram_window, a_dram_tile_window_step);
+```
+
+**What happens:**
+- The tile window **slides along the K dimension** by `KPerBlock` (32 in our example)
+- This prepares for the next K iteration
+- The window origin moves from `(0, 0)` → `(0, 32)` → `(0, 64)` → ...
+
+**Visualization for Problem Size 512×256×64:**
+```
+Matrix A (512×64):
+┌─────────────────────────────────────┐
+│ Block 0: rows 0-255                 │
+│ ┌──────────┬──────────┐             │
+│ │ K=0:31   │ K=32:63  │             │  ← Window slides right
+│ │ Iter 0   │ Iter 1   │             │
+│ └──────────┴──────────┘             │
+└─────────────────────────────────────┘
+```
+
+### Phase 3: Store (VGPRs → LDS)
+
+```cpp
+store_tile(a_copy_lds_window, a_block_tile);
+```
+
+**What happens:**
+1. Each thread writes **its elements** from registers to LDS
+2. Uses the **same distribution** as the DRAM load
+3. Data is now in **shared memory**, accessible to all threads in the block
+
+**Why this step?**
+- GEMM computation needs **all threads** to access **all data**
+- Registers are per-thread; LDS is shared across the block
+- LDS acts as a "staging area" for collaborative computation
+
+### Phase 4: Synchronize
+
+```cpp
+block_sync_lds();
+```
+
+**What happens:**
+- All threads in the block **wait** until everyone has finished storing to LDS
+- Ensures no thread starts reading from LDS before all writes are complete
+- Critical for correctness!
+
+### Phase 5: GEMM Computation
+
+```cpp
+block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window);
+```
+
+**What happens:**
+1. The warp-level GEMM reads data from LDS
+2. Performs matrix multiplication using MFMA instructions
+3. Accumulates results into `c_block_tile` (in registers)
+
+**Note:** `c_block_tile` stays in registers throughout all K iterations, accumulating results.
+
+### Phase 6: Synchronize Again
+
+```cpp
+block_sync_lds();
+```
+
+**What happens:**
+- Ensures all threads have finished reading from LDS
+- Safe to overwrite LDS in the next iteration
+
+---
+
+## Memory Flow Diagram
+
+```
+Iteration 0 (K=0:31):
+┌─────────┐   load_tile   ┌──────────┐   store_tile   ┌─────────┐
+│  DRAM   │ ────────────> │  VGPRs   │ ─────────────> │   LDS   │
+│ A[0:255,│               │ (per-    │                │ A_block │
+│   0:31] │               │  thread) │                │         │
+└─────────┘               └──────────┘                └─────────┘
+                                                            │
+                                                            │ block_gemm
+                                                            ↓
+                                                      ┌──────────┐
+                                                      │ c_block_ │
+                                                      │   tile   │
+                                                      │ (VGPRs)  │
+                                                      └──────────┘
+
+Iteration 1 (K=32:63):
+┌─────────┐   load_tile   ┌──────────┐   store_tile   ┌─────────┐
+│  DRAM   │ ────────────> │  VGPRs   │ ─────────────> │   LDS   │
+│ A[0:255,│               │ (per-    │                │ A_block │
+│  32:63] │               │  thread) │                │ (reused)│
+└─────────┘               └──────────┘                └─────────┘
+                                                            │
+                                                            │ block_gemm
+                                                            ↓
+                                                      ┌──────────┐
+                                                      │ c_block_ │
+                                                      │   tile   │
+                                                      │ (accum.) │
+                                                      └──────────┘
+```
+
+---
+
+## Example: Problem Size 512×256×64
+
+### Block 0 Computation
+
+**Input:**
+- `a_dram_block_window_tmp`: Covers A[0:255, 0:31] initially
+- `b_dram_block_window_tmp`: Covers B[0:127, 0:31] initially (B is stored transposed, as N × K)
+- `num_loop`: 2 (since K=64, KPerBlock=32)
+
+**Iteration 0:**
+1. Load A[0:255, 0:31] and B[0:127, 0:31] from DRAM to VGPRs
+2. Move windows: A → [0:255, 32:63], B → [0:127, 32:63]
+3. Store to LDS
+4. Compute: `C[0:255, 0:127] += A[0:255, 0:31] × B[0:127, 0:31]^T`
+
+**Iteration 1:**
+1. Load A[0:255, 32:63] and B[0:127, 32:63] from DRAM to VGPRs
+2. Move windows: A → [0:255, 64:95], B → [0:127, 64:95] (out of bounds, but loop ends)
+3. Store to LDS
+4. Compute: `C[0:255, 0:127] += A[0:255, 32:63] × B[0:127, 32:63]^T`
+
+**Output:**
+- `c_block_tile`: Contains C[0:255, 0:127] in distributed registers
+
+---
+
+## Key Concepts Summary
+
+### 1. Tile Distribution
+- **Maps threads to data elements** for load/store operations
+- Each thread knows exactly which elements it's responsible for
+- Enables **parallel, vectorized** memory access
+- **Same distribution** used for DRAM load and LDS store
+
+### 2. Static Distributed Tensor
+- **Per-thread register storage** (VGPRs)
+- Each thread owns a **different slice** of the tile
+- **Compile-time sized** for zero-overhead abstraction
+- Used for: `a_block_tile`, `b_block_tile`, `c_block_tile`
+
+### 3. Tile Window Movement
+- Windows **slide** over larger tensors
+- Enables iteration over the K dimension
+- `move_tile_window(window, step)` updates the origin
+
+### 4. LDS as Staging Area
+- **Shared memory** accessible to all threads in a block
+- Required because GEMM needs all threads to access all data
+- **Reused** across K iterations (same LDS buffer)
+
+### 5. Synchronization
+- `block_sync_lds()` ensures memory consistency
+- **Before GEMM**: All stores to LDS are complete
+- **After GEMM**: All reads from LDS are complete
+
+---
+
+## Deep Dive: `static_distributed_tensor` Mechanics
+
+### How Tile Distribution Creates Per-Thread Storage
+
+When you call:
+```cpp
+using ABlockTile = decltype(make_static_distributed_tensor<fp16_t>(ABlockTileDistr{}));
+ABlockTile a_block_tile;
+```
+
+**Step 1: Extract Thread Tensor Descriptor**
+
+The tile distribution contains a `ys_to_d_descriptor` that maps:
+- **Y dimensions** (logical tile coordinates, e.g., M, K)
+- **D dimension** (per-thread register index, linearized)
+
+```cpp
+using ThreadTensorDesc = 
+    decltype(StaticTileDistribution{}.get_ys_to_d_descriptor());
+```
+
+**Step 2: Calculate Per-Thread Buffer Size**
+
+```cpp
+static constexpr index_t kThreadElementSpaceSize = 
+    ThreadTensorDesc{}.get_element_space_size();
+
+static constexpr index_t get_thread_buffer_size()
+{
+    return kThreadElementSpaceSize / PackedSize;
+}
+```
+
+**Example:**
+- 256×32 tile distributed across 256 threads
+- Each thread owns 32 elements (one row)
+- `thread_buffer_size = 32` (for PackedSize=1)
+
+**Step 3: Allocate Thread Buffer**
+
+```cpp
+thread_buffer<DataType, get_thread_buffer_size()> thread_buf_;
+```
+
+This is essentially:
+```cpp
+fp16_t data[32];  // Per-thread register array (VGPRs)
+```
+
+### Usage in Load/Store Operations
+
+**Load from DRAM:**
+```cpp
+a_block_tile = load_tile(a_copy_dram_window);
+```
+
+What happens internally:
+1. Each thread queries the tile distribution: "Which elements do I own?"
+2. Thread 0 learns it owns A[0,0:31]
+3. Thread 0 loads those elements from DRAM into `a_block_tile.thread_buf_[0:31]`
+4. All 256 threads do this **in parallel**
+
+**Store to LDS:**
+```cpp
+store_tile(a_copy_lds_window, a_block_tile);
+```
+
+What happens internally:
+1. Each thread reads from its `a_block_tile.thread_buf_`
+2. Thread 0 writes A[0,0:31] from its registers to LDS
+3. All 256 threads do this **in parallel**
+4. After `block_sync_lds()`, the entire tile is in shared LDS
+
+### Distributed Indexing
+
+The `static_distributed_tensor` supports compile-time indexing:
+
+```cpp
+// Access using distributed indices
+auto value = a_block_tile(tile_distributed_index<i, j>{});
+```
+
+Internally:
+1. Convert distributed index → Y index (logical tile coordinates)
+2. Calculate buffer offset using `ThreadTensorDesc`
+3. Access `thread_buf_[offset]`
+
+All of this happens **at compile time** with zero runtime overhead!
+
+### Why This Design?
+
+**Benefits:**
+1. **Parallel Memory Access**: All threads load/store simultaneously
+2. **Vectorization**: Each thread can use vector loads (e.g., 8×fp16 at once)
+3. **Zero Overhead**: All indexing resolved at compile time
+4. **Type Safety**: Distribution mismatch caught at compile time
+5. **Register Pressure**: Compiler knows exact VGPR usage
+
+**Trade-offs:**
+- Requires compile-time tile sizes
+- Distribution must be static
+- More complex type system
+
+### Memory Hierarchy Summary
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│                         DRAM (Global Memory)                 │
+│                    Full matrices A, B, C                     │
+└─────────────────────────────────────────────────────────────┘
+                              │
+                              │ load_tile (parallel, vectorized)
+                              ↓
+┌─────────────────────────────────────────────────────────────┐
+│                    VGPRs (Per-Thread Registers)              │
+│  Thread 0: a_block_tile.thread_buf_ = [A[0,0:31]]          │
+│  Thread 1: a_block_tile.thread_buf_ = [A[1,0:31]]          │
+│  ...                                                         │
+│  Thread 255: a_block_tile.thread_buf_ = [A[255,0:31]]      │
+│                                                              │
+│  ← static_distributed_tensor manages this distribution      │
+└─────────────────────────────────────────────────────────────┘
+                              │
+                              │ store_tile (parallel, vectorized)
+                              ↓
+┌─────────────────────────────────────────────────────────────┐
+│                    LDS (Shared Memory)                       │
+│              Entire block tile (256×32)                      │
+│           Accessible to all threads in block                 │
+└─────────────────────────────────────────────────────────────┘
+```
+
+**Key Insight:**
+`static_distributed_tensor` is the abstraction that enables efficient, parallel data movement between DRAM and LDS through per-thread VGPRs, with all coordination happening at compile time.
+
+
+
diff --git a/tutorial/ck_tile/01_naive_gemm/CMakeLists.txt b/tutorial/ck_tile/01_naive_gemm/CMakeLists.txt
new file mode 100644
index 0000000000..e16977921a
--- /dev/null
+++ b/tutorial/ck_tile/01_naive_gemm/CMakeLists.txt
@@ -0,0 +1,7 @@
+add_executable(tile_tutorial_naive_gemm EXCLUDE_FROM_ALL practice_gemm.cpp)
+
+target_compile_options(tile_tutorial_naive_gemm PRIVATE
+  -mllvm -enable-noalias-to-md-conversion=0
+)
+
+add_dependencies(tutorials tile_tutorial_naive_gemm)
\ No newline at end of file
diff --git a/tutorial/ck_tile/01_naive_gemm/HOST_LEVEL_PIPELINE.md b/tutorial/ck_tile/01_naive_gemm/HOST_LEVEL_PIPELINE.md
new file mode 100644
index 0000000000..43cb01fb36
--- /dev/null
+++ b/tutorial/ck_tile/01_naive_gemm/HOST_LEVEL_PIPELINE.md
@@ -0,0 +1,618 @@
+# Host-Level Pipeline: Orchestrating Block-Level GEMM
+
+This document explains the **host-level pipeline** (`PracticeGemmHostPipeline`), which orchestrates the distribution of work across thread blocks and manages the high-level flow of the GEMM computation.
+
+## Overview
+
+The host-level pipeline is responsible for:
+1. **Calculating tile coverage**: How many tiles are needed to cover matrices A, B, and C
+2. **Block-to-tile mapping**: Assigning each thread block to a specific tile
+3. **Creating tile windows**: Establishing sliding windows over tensor views
+4. **Delegating computation**: Calling the block-level pipeline to perform actual GEMM
+5. **Storing results**: Writing computed tiles from registers (VGPRs) back to DRAM
+
+```cpp
+template <typename Problem_, typename Policy_ = PracticeGemmHostPolicy>
+struct PracticeGemmHostPipeline
+{
+    template <typename ADRAMTensorView, typename BDRAMTensorView, typename CDRAMTensorView>
+    CK_TILE_DEVICE void operator()(const ADRAMTensorView& a_dram,
+                                   const BDRAMTensorView& b_dram,
+                                   CDRAMTensorView& c_dram) const
+    {
+        // 1. Calculate problem dimensions and tile coverage
+        // 2. Map thread block to tile coordinates
+        // 3. Create tile windows over A and B
+        // 4. Call block-level pipeline to compute
+        // 5. Store result to C
+    }
+};
+```
+
+---
+
+## Step 1: Calculate Problem Dimensions and Tile Coverage
+
+```cpp
+// Size of the entire problem
+const auto M = a_dram.get_tensor_descriptor().get_length(number<0>{}); // M x K
+const auto N = c_dram.get_tensor_descriptor().get_length(number<1>{}); // M x N
+const auto K = a_dram.get_tensor_descriptor().get_length(number<1>{}); // M x K
+
+// Size of the block tile
+const auto MPerBlock = BlockTile::at(number<0>{});  // 256
+const auto NPerBlock = BlockTile::at(number<1>{});  // 128
+const auto KPerBlock = BlockTile::at(number<2>{});  // 32
+
+// Number of block tiles needed to cover C matrix
+const auto num_tile_n = integer_divide_ceil(N, NPerBlock);  // ceil(256/128) = 2
+const auto num_tile_m = integer_divide_ceil(M, MPerBlock);  // ceil(512/256) = 2
+```
+
+### What's Happening:
+
+1. **Extract problem dimensions** from tensor descriptors:
+   - `M = 512`: Rows in A and C
+   - `N = 256`: Rows in B (stored as N × K) and columns in C
+   - `K = 64`: Inner dimension (columns of A and of B, since B is stored as N × K)
+
+2. **Get block tile sizes** from the `BlockTile` configuration:
+   - `MPerBlock = 256`: Each block processes 256 rows
+   - `NPerBlock = 128`: Each block processes 128 columns
+   - `KPerBlock = 32`: Each block processes 32 elements in K dimension per iteration
+
+3. **Calculate tile coverage**:
+   - `num_tile_m = ceil(M / MPerBlock) = ceil(512/256) = 2` tiles in M direction
+   - `num_tile_n = ceil(N / NPerBlock) = ceil(256/128) = 2` tiles in N direction
+   - **Total tiles = 2 × 2 = 4 tiles** → We need **4 thread blocks**!
+
+### Visual Representation:
+
+```
+Matrix C (512 × 256):
+┌──────────────────────┬──────────────────────┐
+│   Tile (0,0)         │   Tile (0,1)         │  ← num_tile_n = 2
+│   256×128            │   256×128            │
+│   Block 0            │   Block 2            │
+│                      │                      │
+├──────────────────────┼──────────────────────┤
+│   Tile (1,0)         │   Tile (1,1)         │
+│   256×128            │   256×128            │
+│   Block 1            │   Block 3            │
+│                      │                      │
+└──────────────────────┴──────────────────────┘
+         ↑
+    num_tile_m = 2
+
+Total blocks needed = 2 × 2 = 4 blocks
+
+Each block computes one 256×128 tile of the output matrix C.
+```
+
+### How Blocks Cover Matrices A and B:
+
+```
+Matrix A (512 × 64):                Matrix B (256 × 64):
+┌─────────────┬──────┐             ┌─────────────┬──────┐
+│ Block 0,2   │  K   │             │ Block 0,1   │  K   │
+│ uses rows   │  →   │             │ uses rows   │  →   │
+│ 0-255       │      │             │ 0-127       │      │
+├─────────────┼──────┤             ├─────────────┼──────┤
+│ Block 1,3   │  K   │             │ Block 2,3   │  K   │
+│ uses rows   │  →   │             │ uses rows   │  →   │
+│ 256-511     │      │             │ 128-255     │      │
+└─────────────┴──────┘             └─────────────┴──────┘
+   256 rows    64 cols                128 rows    64 cols
+   
+Each block needs to iterate over K dimension (64/32 = 2 iterations)
+```
+
+---
+
+## Step 2: Map Thread Block to Tile Coordinates
+
+```cpp
+// Get block id (0 to total_blocks - 1)
+const auto id_block = get_block_id();
+
+// Map block id to 2D tile coordinates
+const auto block2tile = Policy::MakeBlock2TileMap(num_tile_m, num_tile_n);
+const auto tile_id = block2tile(id_block);
+
+const auto tile_id_m = tile_id.at(number<0>{});  // M coordinate
+const auto tile_id_n = tile_id.at(number<1>{});  // N coordinate
+```
+
+### What's Happening:
+
+Each thread block needs to know **which tile of the output matrix C it should compute**. The `MakeBlock2TileMap` function creates a mapping from linear block ID to 2D tile coordinates.
+
+### The `MakeBlock2TileMap` Function:
+
+```cpp
+CK_TILE_HOST_DEVICE static constexpr auto MakeBlock2TileMap(index_t M0, index_t N0)
+{
+    // Create a merge transform: (N0, M0) → linear index
+    const auto unmerge = make_merge_transform(make_tuple(N0, M0));
+
+    return [unmerge](index_t block_id) {
+        multi_index<2> unmerged;
+        // Convert linear block_id back to 2D coordinates
+        unmerge.calculate_lower_index(unmerged, make_multi_index(block_id));
+
+        // Return (m_idx, n_idx) - note the swap!
+        return make_multi_index(unmerged.at(number<1>{}), unmerged.at(number<0>{}));
+    };
+}
+```
+
+### In Our Example (2×2 Grid):
+
+```cpp
+// Block 0:
+id_block = 0
+tile_id = block2tile(0) = (0, 0)  // Top-left tile
+tile_id_m = 0, tile_id_n = 0
+
+// Block 1:
+id_block = 1
+tile_id = block2tile(1) = (1, 0)  // Bottom-left tile
+tile_id_m = 1, tile_id_n = 0
+
+// Block 2:
+id_block = 2
+tile_id = block2tile(2) = (0, 1)  // Top-right tile
+tile_id_m = 0, tile_id_n = 1
+
+// Block 3:
+id_block = 3
+tile_id = block2tile(3) = (1, 1)  // Bottom-right tile
+tile_id_m = 1, tile_id_n = 1
+```
+
+**Key Point**: Each of the 4 blocks knows exactly which 256×128 tile of C it's responsible for computing!
+
+---
+
+## Step 3: Calculate Tile Origin and Create Tile Windows
+
+```cpp
+// Calculate the starting position of this tile in the global matrix
+const auto tile_origin_m = tile_id_m * MPerBlock;  // e.g., Block 1: 1 * 256 = 256
+const auto tile_origin_n = tile_id_n * NPerBlock;  // e.g., Block 2: 1 * 128 = 128
+
+// Create tile windows over A and B tensor views
+const auto a_block_window = make_tile_window(
+    a_dram,                                      // Tensor view over A
+    make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),  // Window size: 256×32
+    {tile_origin_m, 0}                          // Origin: varies by block
+);
+
+const auto b_block_window = make_tile_window(
+    b_dram,                                      // Tensor view over B
+    make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),  // Window size: 128×32
+    {tile_origin_n, 0}                          // Origin: varies by block
+);
+```
+
+### Tile Origins for Each Block:
+
+```cpp
+// Block 0 (Tile 0,0):
+tile_origin_m = 0 * 256 = 0
+tile_origin_n = 0 * 128 = 0
+a_block_window origin: (0, 0)    → covers A rows 0-255
+b_block_window origin: (0, 0)    → covers B rows 0-127
+
+// Block 1 (Tile 1,0):
+tile_origin_m = 1 * 256 = 256
+tile_origin_n = 0 * 128 = 0
+a_block_window origin: (256, 0)  → covers A rows 256-511
+b_block_window origin: (0, 0)    → covers B rows 0-127
+
+// Block 2 (Tile 0,1):
+tile_origin_m = 0 * 256 = 0
+tile_origin_n = 1 * 128 = 128
+a_block_window origin: (0, 0)    → covers A rows 0-255
+b_block_window origin: (128, 0)  → covers B rows 128-255
+
+// Block 3 (Tile 1,1):
+tile_origin_m = 1 * 256 = 256
+tile_origin_n = 1 * 128 = 128
+a_block_window origin: (256, 0)  → covers A rows 256-511
+b_block_window origin: (128, 0)  → covers B rows 128-255
+```
+
+### What are Tile Windows?
+
+A **tile window** is a **sliding window** over a larger tensor view. It:
+- Defines a **rectangular region** within the tensor
+- Has a **fixed size** (e.g., 256×32 for A)
+- Has an **origin** (starting position)
+- Can be **moved** to access different regions
+### Visual Representation (Block 0 Example):
+
+```
+Matrix A (512 × 64):                    Matrix B (256 × 64):
+┌─────────────┬─────────────┐          ┌─────────────┬─────────────┐
+│ ┏━━━━━━━━━┓ │             │          │ ┏━━━━━━━━━┓ │             │
+│ ┃ Window  ┃ │             │          │ ┃ Window  ┃ │             │
+│ ┃ 256×32  ┃ │             │          │ ┃ 128×32  ┃ │             │
+│ ┃ K=0-31  ┃ │             │          │ ┃ K=0-31  ┃ │             │
+│ ┗━━━━━━━━━┛ │             │          │ ┗━━━━━━━━━┛ │             │
+│             │             │          ├─────────────┼─────────────┤
+├─────────────┼─────────────┤          │             │             │
+│             │             │          │             │             │
+│             │             │          │             │             │
+│             │             │          │             │             │
+└─────────────┴─────────────┘          └─────────────┴─────────────┘
+  Origin: (0, 0)                         Origin: (0, 0)
+  Covers rows 0-255                      Covers rows 0-127
+  Covers cols 0-31 (first K iteration)   Covers cols 0-31 (first K iteration)
+```
+
+**Note**: The window initially covers K columns 0-31. It will move to cover K columns 32-63 in the next iteration.
+
+### Tile Window Properties:
+
+```cpp
+// Tile window structure (conceptual):
+struct tile_window {
+    TensorView& tensor_view;     // Reference to underlying tensor
+    Tuple window_lengths;         // Size of the window (256, 32)
+    MultiIndex window_origin;     // Starting position (0, 0)
+    
+    // Can move the window:
+    void move(MultiIndex step);   // Shift window by step
+    
+    // Access data through the window:
+    auto load();                  // Load data from windowed region
+};
+```
+
+
+### Tile Window Movement: Iterating Over K Dimension
+
+In our example, **K=64** but **KPerBlock=32**, so we need **2 iterations** over the K dimension:
+
+```
+Matrix A (512 × 64) - Block 0's view:
+┌─────────────┬─────────────┐
+│ ┏━━━━━━━━━┓ │ ╔═══════════╗ │
+│ ┃ Iter 0  ┃ │ ║  Iter 1   ║ │  ← Window slides along K
+│ ┃ 256×32  ┃ │ ║  256×32   ║ │
+│ ┃ K=0-31  ┃ │ ║  K=32-63  ║ │
+│ ┗━━━━━━━━━┛ │ ╚═══════════╝ │
+├─────────────┼─────────────┤
+│             │             │
+│  Block 1's  │             │
+│  region     │             │
+└─────────────┴─────────────┘
+
+Matrix B (256 × 64) - Block 0's view:
+┌─────────────┬─────────────┐
+│ ┏━━━━━━━━━┓ │ ╔═══════════╗ │
+│ ┃ Iter 0  ┃ │ ║  Iter 1   ║ │
+│ ┃ 128×32  ┃ │ ║  128×32   ║ │
+│ ┃ K=0-31  ┃ │ ║  K=32-63  ║ │
+│ ┗━━━━━━━━━┛ │ ╚═══════════╝ │
+├─────────────┼─────────────┤
+│  Block 2's  │             │
+│  region     │             │
+└─────────────┴─────────────┘
+```
+
+### How Windows Move (Conceptual - handled by block pipeline):
+
+```cpp
+// Iteration 0:
+a_block_window origin: (tile_origin_m, 0)     // K columns 0-31
+b_block_window origin: (tile_origin_n, 0)     // K columns 0-31
+// Compute: C_partial_0 = A[:, 0:31] × B[:, 0:31]ᵀ   (B is stored as N×K, so its slice is transposed)
+
+// Move windows to next K position:
+move_tile_window(a_block_window, {0, 32});
+move_tile_window(b_block_window, {0, 32});
+
+// Iteration 1:
+a_block_window origin: (tile_origin_m, 32)    // K columns 32-63
+b_block_window origin: (tile_origin_n, 32)    // K columns 32-63
+// Compute: C_partial_1 = A[:, 32:63] × B[:, 32:63]ᵀ
+
+// Final result:
+// C_tile = C_partial_0 + C_partial_1
+```
+
+**Key Insight**: The tile windows **slide along the K dimension** to cover the full inner product. Each block accumulates partial results across K iterations to compute its final tile of C.
+
+---
+
+## Step 4: Delegate to Block-Level Pipeline
+
+```cpp
+// Get the block-level pipeline from policy
+constexpr auto block_gemm_pipeline =
+    Policy::template GetPracticeGemmBlockPipeline<Problem>();
+
+// Calculate number of K iterations needed
+int num_loops_k = integer_divide_ceil(K, KPerBlock);  // ceil(64/32) = 2
+
+// Allocate shared memory (LDS) for block-level computation
+__shared__ char p_smem_char[block_gemm_pipeline.GetStaticLDSSize()];
+
+// Call block-level pipeline to compute C tile
+const auto c_block_tile =
+    block_gemm_pipeline(a_block_window, b_block_window, num_loops_k, p_smem_char);
+```
+
+### What's Happening:
+
+1. **Retrieve block pipeline**: The policy provides the block-level GEMM implementation
+2. **Calculate K iterations**: How many times to iterate over the K dimension
+   - In our example: `K=64, KPerBlock=32` → **2 iterations**
+   - Each iteration processes 32 elements of the K dimension
+   - Results are accumulated across iterations
+
+3. **Allocate shared memory**: 
+   - `__shared__` declares memory shared by all threads in the block
+   - `GetStaticLDSSize()` returns the required size in bytes
+   - This memory is used for:
+     - Staging data from DRAM → LDS
+     - Cooperative loading by threads
+     - Fast access during computation
+
+4. **Execute block pipeline**:
+   - Takes A and B tile windows as input
+   - Performs the GEMM computation: `C_tile = A_tile × B_tile`
+   - Returns result in `c_block_tile` (stored in VGPRs - registers)
+
+### Memory Hierarchy During Computation:
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│ DRAM (Global Memory) - Slowest, Largest                     │
+│ ┌─────────────┐  ┌─────────────┐  ┌─────────────┐         │
+│ │   A matrix  │  │   B matrix  │  │   C matrix  │         │
+│ └─────────────┘  └─────────────┘  └─────────────┘         │
+└─────────────────────────────────────────────────────────────┘
+         ↓ load                ↓ load              ↑ store
+┌─────────────────────────────────────────────────────────────┐
+│ LDS (Shared Memory) - Fast, Limited Size (~64KB)           │
+│ ┌─────────────┐  ┌─────────────┐                           │
+│ │  A_tile     │  │  B_tile     │  ← Staged here            │
+│ │  (p_smem)   │  │  (p_smem)   │                           │
+│ └─────────────┘  └─────────────┘                           │
+└─────────────────────────────────────────────────────────────┘
+         ↓ load                ↓ load
+┌─────────────────────────────────────────────────────────────┐
+│ VGPRs (Registers) - Fastest, Smallest (~256 regs/thread)   │
+│ ┌─────────────────────────────────────────────────────────┐ │
+│ │  c_block_tile (accumulated result)                      │ │
+│ │  Computation happens here using MFMA instructions       │ │
+│ └─────────────────────────────────────────────────────────┘ │
+└─────────────────────────────────────────────────────────────┘
+```
+
+### Block Pipeline Responsibilities:
+
+The block pipeline (called here) will:
+1. Load A and B tiles from DRAM → LDS (cooperative loading)
+2. Distribute work among warps
+3. Each warp loads its portion from LDS → VGPRs
+4. Perform MFMA operations: `C += A × B`
+5. Accumulate results in VGPRs
+6. Return final `c_block_tile` in registers
+
+---
+
+## Step 5: Store Results to DRAM
+
+```cpp
+// Create a tile window over C for writing results
+auto c_window = make_tile_window(
+    c_dram,                                      // Tensor view over C
+    make_tuple(number<MPerBlock>{}, number<NPerBlock>{}),  // Window size: 256×128
+    {tile_origin_m, tile_origin_n}              // Origin: varies by block
+);
+
+// Store computed tile from VGPRs to DRAM
+store_tile(c_window, c_block_tile);
+```
+
+### C Window Origins for Each Block:
+
+```cpp
+// Block 0: Writes to top-left tile
+c_window origin: (0, 0)      → writes to C[0:255, 0:127]
+
+// Block 1: Writes to bottom-left tile
+c_window origin: (256, 0)    → writes to C[256:511, 0:127]
+
+// Block 2: Writes to top-right tile
+c_window origin: (0, 128)    → writes to C[0:255, 128:255]
+
+// Block 3: Writes to bottom-right tile
+c_window origin: (256, 128)  → writes to C[256:511, 128:255]
+```
+
+### What's Happening:
+
+1. **Create C tile window**: 
+   - Size: 256×128 (matches our block tile size)
+   - Origin: Varies by block - each block writes to its assigned region
+   - This window defines **where** to write the results
+
+2. **Store tile to DRAM**:
+   - `c_block_tile`: Computed results in VGPRs (registers)
+   - `c_window`: Destination window in DRAM
+   - `store_tile()`: Efficiently writes data from registers → DRAM
+
+### The `store_tile` Function:
+
+Recall from our earlier discussion that `store_tile` works as follows:
+
+```cpp
+template <typename TileWindow, typename DistributedTensor>
+void store_tile(TileWindow& tile_window_tmp,
+                const DistributedTensor& dstr_tensor)
+{
+    // 1. Extract tile distribution from distributed tensor
+    using TileDstr = typename DistributedTensor::TileDistribution;
+    
+    // 2. Upgrade simple tile window to one with distribution
+    auto tile_window = make_tile_window(
+        tile_window_tmp.get_bottom_tensor_view(),
+        tile_window_tmp.get_window_lengths(),
+        tile_window_tmp.get_window_origin(),
+        TileDstr{}  // Add distribution info
+    );
+    
+    // 3. Store using vectorized writes
+    tile_window.store(dstr_tensor);
+}
+```
+
+### Memory Flow:
+
+```
+VGPRs (Registers)                    DRAM (Global Memory)
+┌─────────────────────┐              ┌─────────────────────┐
+│  c_block_tile       │              │  C matrix           │
+│  ┌───┬───┬───┬───┐  │              │  ┌───────────────┐  │
+│  │W0 │W1 │W2 │W3 │  │  store_tile  │  │               │  │
+│  ├───┼───┼───┼───┤  │  ==========> │  │  c_window     │  │
+│  │...│...│...│...│  │  vectorized  │  │  (256×128)    │  │
+│  └───┴───┴───┴───┘  │              │  │               │  │
+│  Distributed across  │              │  └───────────────┘  │
+│  threads/warps       │              │  Origin: (0, 0)     │
+└─────────────────────┘              └─────────────────────┘
+
+Each thread writes its portion using vector stores (e.g., float4)
+```
+
+### Store Optimization:
+
+The `store_tile` function:
+- Uses **vectorized stores** (write multiple elements at once)
+- Ensures **coalesced memory access** (adjacent threads write adjacent memory)
+- Respects **tile distribution** (each thread knows what data it owns)
+- Handles **out-of-bounds** checking (for partial tiles at boundaries)
+
+---
+
+## Complete Flow Visualization
+
+Let's trace the complete flow for **Block 0** (other blocks follow the same pattern):
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│ Step 1: Calculate Tile Coverage                                │
+│ ┌─────────────────────────────────────────────────────────────┐ │
+│ │ M=512, N=256, K=64                                          │ │
+│ │ MPerBlock=256, NPerBlock=128, KPerBlock=32                  │ │
+│ │ num_tile_m = ceil(512/256) = 2                              │ │
+│ │ num_tile_n = ceil(256/128) = 2                              │ │
+│ │ Total blocks needed = 2 × 2 = 4 blocks                     │ │
+│ └─────────────────────────────────────────────────────────────┘ │
+└─────────────────────────────────────────────────────────────────┘
+                            ↓
+┌─────────────────────────────────────────────────────────────────┐
+│ Step 2: Map Block to Tile (Block 0 example)                   │
+│ ┌─────────────────────────────────────────────────────────────┐ │
+│ │ Block ID: 0                                                 │ │
+│ │ Tile coordinates: (0, 0) - top-left tile                   │ │
+│ │ Tile origin: (0, 0)                                         │ │
+│ │                                                             │ │
+│ │ (Blocks 1,2,3 get different tile coordinates)              │ │
+│ └─────────────────────────────────────────────────────────────┘ │
+└─────────────────────────────────────────────────────────────────┘
+                            ↓
+┌─────────────────────────────────────────────────────────────────┐
+│ Step 3: Create Tile Windows                                    │
+│ ┌─────────────────────────────────────────────────────────────┐ │
+│ │ a_block_window: 256×32 starting at (0,0) over A            │ │
+│ │ b_block_window: 128×32 starting at (0,0) over B            │ │
+│ │ Windows initially cover K columns 0-31                      │ │
+│ └─────────────────────────────────────────────────────────────┘ │
+└─────────────────────────────────────────────────────────────────┘
+                            ↓
+┌─────────────────────────────────────────────────────────────────┐
+│ Step 4: Execute Block Pipeline (2 K iterations)                │
+│ ┌─────────────────────────────────────────────────────────────┐ │
+│ │ Allocate shared memory (LDS)                                │ │
+│ │ Call block_gemm_pipeline(a_window, b_window, 2, p_smem)    │ │
+│ │                                                             │ │
+│ │ K Iteration 0 (K=0-31):                                     │ │
+│ │   ├─ Load A tile: DRAM → LDS → VGPRs                       │ │
+│ │   ├─ Load B tile: DRAM → LDS → VGPRs                       │ │
+│ │   ├─ Compute: C_partial_0 = A[:, 0:31] × B[:, 0:31]        │ │
+│ │   └─ Move windows: {0, 32}                                  │ │
+│ │                                                             │ │
+│ │ K Iteration 1 (K=32-63):                                    │ │
+│ │   ├─ Load A tile: DRAM → LDS → VGPRs                       │ │
+│ │   ├─ Load B tile: DRAM → LDS → VGPRs                       │ │
+│ │   ├─ Compute: C_partial_1 = A[:, 32:63] × B[:, 32:63]      │ │
+│ │   └─ Accumulate: C_tile = C_partial_0 + C_partial_1        │ │
+│ │                                                             │ │
+│ │ Return c_block_tile in VGPRs (256×128 accumulated result)  │ │
+│ └─────────────────────────────────────────────────────────────┘ │
+└─────────────────────────────────────────────────────────────────┘
+                            ↓
+┌─────────────────────────────────────────────────────────────────┐
+│ Step 5: Store Results                                          │
+│ ┌─────────────────────────────────────────────────────────────┐ │
+│ │ Create c_window: 256×128 starting at (0,0) over C          │ │
+│ │ store_tile(c_window, c_block_tile)                          │ │
+│ │   └─ Write from VGPRs → DRAM (vectorized stores)            │ │
+│ │                                                             │ │
+│ │ Block 0 writes to C[0:255, 0:127]                          │ │
+│ │ (Other blocks write to their respective regions)           │ │
+│ └─────────────────────────────────────────────────────────────┘ │
+└─────────────────────────────────────────────────────────────────┘
+
+All 4 blocks execute in parallel, each computing its assigned 256×128 tile!
+```
+
+---
+
+## Key Concepts Summary
+
+### 1. **Tile Coverage**
+- Determines how many thread blocks are needed
+- Each block processes one tile of the output matrix C
+- Calculated as `ceil(dimension / tile_size)`
+
+### 2. **Block-to-Tile Mapping**
+- Maps linear block ID to 2D tile coordinates
+- Uses column-major ordering for better memory coalescing
+- Each block knows which tile it's responsible for
+
+### 3. **Tile Windows**
+- **Sliding windows** over larger tensor views
+- Define a rectangular region with fixed size and movable origin
+- Provide efficient, structured access to tensor data
+- Can be moved to access different regions (e.g., for K iterations)
+
+### 4. **Memory Hierarchy**
+- **DRAM (Global)**: Largest, slowest - stores full matrices
+- **LDS (Shared)**: Medium, fast - stages tiles for cooperative access
+- **VGPRs (Registers)**: Smallest, fastest - performs computation
+
+### 5. **Data Flow**
+```
+DRAM → Tile Windows → LDS → VGPRs → Computation → VGPRs → DRAM
+  ↑                                                           ↓
+  A, B matrices                                         C matrix
+```
+
+---
+
+## Next Steps
+
+The host-level pipeline has set up the work and delegated to the block-level pipeline. Next, we'll explore:
+- **Block-level pipeline**: How tiles are loaded, distributed to warps, and computed
+- **Warp-level pipeline**: How warps perform MFMA operations
+- **Memory optimization**: LDS usage, bank conflicts, coalescing
+
+The host level provides the **orchestration**, while the block and warp levels provide the **execution**!
+
diff --git a/tutorial/ck_tile/01_naive_gemm/KERNEL_ENTRY_POINT.md b/tutorial/ck_tile/01_naive_gemm/KERNEL_ENTRY_POINT.md
new file mode 100644
index 0000000000..7cd0d06fc5
--- /dev/null
+++ b/tutorial/ck_tile/01_naive_gemm/KERNEL_ENTRY_POINT.md
@@ -0,0 +1,464 @@
+# PracticeGemmKernel: Understanding the Kernel Entry Point
+
+This document explains the `PracticeGemmKernel` structure, which serves as the **entry point** for our GEMM GPU kernel. We'll dive deep into how raw memory is transformed into structured tensor views.
+
+## Overview
+
+The `PracticeGemmKernel` is a templated struct that:
+1. Takes raw device memory pointers for matrices A, B, and C
+2. Wraps them into **tensor views** - logical, structured views over physical memory
+3. Dispatches to the host-level pipeline for computation
+
+```cpp
+template <typename Problem_, typename Policy_>
+struct PracticeGemmKernel
+{
+    using Problem = remove_cvref_t<Problem_>;
+    using Policy  = remove_cvref_t<Policy_>;
+
+    static constexpr index_t kBlockSize = 256;
+
+    CK_TILE_DEVICE void operator()(const typename Problem::ADataType* p_a,
+                                   const typename Problem::BDataType* p_b,
+                                   typename Problem::CDataType* p_c,
+                                   const index_t M,
+                                   const index_t N,
+                                   const index_t K,
+                                   const index_t stride_a,
+                                   const index_t stride_b,
+                                   const index_t stride_c) const
+    {
+        // Step 1: Create tensor views over raw memory
+        auto a_dram = make_naive_tensor_view<address_space_enum::global>(
+            p_a, make_tuple(M, K), make_tuple(stride_a, 1), number<8>{}, number<1>{});
+
+        auto b_dram = make_naive_tensor_view<address_space_enum::global>(
+            p_b, make_tuple(N, K), make_tuple(stride_b, 1), number<8>{}, number<1>{});
+
+        const auto c_dram = make_naive_tensor_view<address_space_enum::global>(
+            p_c, make_tuple(M, N), make_tuple(stride_c, 1), number<8>{}, number<1>{});
+
+        // Step 2: Dispatch to host-level pipeline
+        PracticeGemmHostPipeline<Problem, Policy>{}(a_dram, b_dram, c_dram);
+    }
+};
+```
+
+---
+
+## What are Tensor Views?
+
+A **tensor view** is a **logical, structured view over raw physical memory**. It doesn't own or allocate memory—it simply provides a way to interpret and access existing memory as a multi-dimensional tensor.
+
+### Key Components of a Tensor View:
+
+1. **Memory Type**: Where the data lives (global/DRAM, LDS/shared, registers)
+2. **Raw Pointer**: Points to the actual data in memory
+3. **Shape**: Dimensions of the tensor (e.g., M×K for matrix A)
+4. **Strides**: How to navigate through memory to access elements
+5. **Guaranteed Vector Length**: How many consecutive elements can be loaded in one vector instruction
+6. **Guaranteed Vector Stride**: The stride of those vectorizable elements
+
+---
+
+## The Memory Abstraction Hierarchy
+
+CK Tile uses a three-layer abstraction to go from raw memory to structured tensors:
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│ Layer 3: TENSOR VIEW                                        │
+│ ┌─────────────────────────────────────────────────────────┐ │
+│ │ • Logical multi-dimensional structure                   │ │
+│ │ • Shape: (M, K) = (256, 32)                            │ │
+│ │ • Strides: (32, 1) for row-major layout                │ │
+│ │ • Provides: operator[], coordinate-based access         │ │
+│ │ • Knows: How to map (i,j) → linear offset              │ │
+│ └─────────────────────────────────────────────────────────┘ │
+│                           ↓ wraps                            │
+│ ┌─────────────────────────────────────────────────────────┐ │
+│ │ Layer 2: BUFFER VIEW                                    │ │
+│ │ ┌─────────────────────────────────────────────────────┐ │ │
+│ │ │ • Linear view of memory                             │ │ │
+│ │ │ • Pointer: p_data_ → device memory                  │ │ │
+│ │ │ • Size: Total number of elements                    │ │ │
+│ │ │ • Address space: global/LDS/generic                 │ │ │
+│ │ │ • Provides: Vectorized loads/stores, bounds checking│ │ │
+│ │ └─────────────────────────────────────────────────────┘ │ │
+│ └─────────────────────────────────────────────────────────┘ │
+│                           ↓ wraps                            │
+│ ┌─────────────────────────────────────────────────────────┐ │
+│ │ Layer 1: RAW PHYSICAL MEMORY                            │ │
+│ │ ┌─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┬─────┐ │ │
+│ │ │ 0.0 │ 1.0 │ 2.0 │ 3.0 │ 4.0 │ 5.0 │ 6.0 │ 7.0 │ ... │ │ │
+│ │ └─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┴─────┘ │ │
+│ │   ↑                                                       │ │
+│ │   p_a (raw pointer from hipMalloc)                       │ │
+│ └─────────────────────────────────────────────────────────┘ │
+└─────────────────────────────────────────────────────────────┘
+```
+
+---
+
+## Deep Dive: `make_naive_tensor_view`
+
+Let's break down the function call for matrix A:
+
+```cpp
+auto a_dram = make_naive_tensor_view<address_space_enum::global>(
+    p_a,                    // Raw pointer to device memory
+    make_tuple(M, K),       // Shape: (256, 32)
+    make_tuple(stride_a, 1), // Strides: (32, 1) - row-major
+    number<8>{},            // Guaranteed vector length
+    number<1>{}             // Guaranteed vector stride
+);
+```
+
+### Function Signature:
+
+```cpp
+template <address_space_enum BufferAddressSpace = address_space_enum::generic,
+          memory_operation_enum DstInMemOp      = memory_operation_enum::set,
+          amd_buffer_coherence_enum Coherence   = amd_buffer_coherence_enum::coherence_default,
+          typename DataType,
+          typename... Lengths,
+          typename... Strides,
+          index_t GuaranteedLastDimensionVectorLength = -1,
+          index_t GuaranteedLastDimensionVectorStride = -1>
+CK_TILE_HOST_DEVICE constexpr auto
+make_naive_tensor_view(DataType* __restrict__ p,
+                       const tuple<Lengths...>& lengths,
+                       const tuple<Strides...>& strides,
+                       number<GuaranteedLastDimensionVectorLength> = number<-1>{},
+                       number<GuaranteedLastDimensionVectorStride> = number<-1>{})
+{
+    // Step 1: Create tensor descriptor (shape + stride information)
+    auto desc = make_naive_tensor_descriptor(lengths,
+                                             strides,
+                                             number<GuaranteedLastDimensionVectorLength>{},
+                                             number<GuaranteedLastDimensionVectorStride>{});
+
+    // Step 2: Create buffer view (pointer + size + address space)
+    auto buffer_view =
+        make_buffer_view<BufferAddressSpace, Coherence>(p, desc.get_element_space_size());
+
+    // Step 3: Combine into tensor view
+    return tensor_view<decltype(buffer_view), decltype(desc), DstInMemOp>{buffer_view, desc};
+}
+```
+
+---
+
+## Parameter Breakdown
+
+### 1. **Template Parameter: `address_space_enum::global`**
+
+Specifies where the memory lives:
+- `global`: GPU global memory (DRAM) - slowest but largest
+- `lds`: Local Data Share (shared memory) - fast, limited size
+- `generic`: Generic address space
+- `vgpr`: Vector General Purpose Registers - fastest, smallest
+
+In our case, `global` means the data is in GPU DRAM.
+
+### 2. **`p_a` - Raw Pointer**
+
+The raw device memory pointer returned by `hipMalloc`. Points to the start of the matrix data.
+
+### 3. **`make_tuple(M, K)` - Shape/Lengths**
+
+Defines the logical dimensions of the tensor:
+- For matrix A: `(256, 32)` means 256 rows, 32 columns
+- This is the **logical view**, independent of how data is physically laid out
+
+### 4. **`make_tuple(stride_a, 1)` - Strides**
+
+Defines how to navigate through memory:
+- **Stride for dimension 0 (rows)**: `stride_a = K = 32`
+  - To move to the next row, skip 32 elements
+- **Stride for dimension 1 (columns)**: `1`
+  - To move to the next column, skip 1 element
+
+**Row-major layout example:**
+```
+Memory:  [a₀₀, a₀₁, a₀₂, ..., a₀₃₁, a₁₀, a₁₁, a₁₂, ..., a₁₃₁, ...]
+          ↑                         ↑
+          Row 0 starts here         Row 1 starts here (offset = 32)
+
+To access element A[i][j]:
+    offset = i * stride_a + j * 1
+           = i * 32 + j
+```
+
+### 5. **`number<8>{}` - Guaranteed Last Dimension Vector Length**
+
+This tells the tensor view: **"The last dimension (K) is guaranteed to have at least 8 consecutive elements that can be loaded together in a single vector instruction."**
+
+#### Why is this important?
+
+Modern GPUs can load multiple elements in one instruction (vectorized loads):
+- `float4`: Load 4 floats at once
+- `float8`: Load 8 floats at once (if supported)
+
+By specifying `number<8>{}`, we're telling the system:
+- "You can safely use vector loads of up to 8 elements"
+- "The memory alignment and layout support this"
+
+**Example:**
+```cpp
+// Without vectorization (slow):
+for (int j = 0; j < 8; j++) {
+    data[j] = memory[offset + j];  // 8 separate loads
+}
+
+// With vectorization (fast):
+float8 vec = *reinterpret_cast<float8*>(&memory[offset]);  // 1 load!
+```
+
+### 6. **`number<1>{}` - Guaranteed Last Dimension Vector Stride**
+
+This specifies the **stride between consecutive vectorizable elements** in the last dimension.
+
+- `number<1>{}` means: "Consecutive elements in the last dimension are contiguous in memory (stride = 1)"
+- This confirms that elements `A[i][0], A[i][1], A[i][2], ..., A[i][7]` are stored consecutively
+
+**Why does this matter?**
+
+For efficient vectorized loads, elements must be:
+1. **Contiguous** (stride = 1) ✓
+2. **Aligned** properly in memory
+3. **Within the same cache line** (ideally)
+
+If the stride were `2`, it would mean:
+```
+A[i][0] is at offset 0
+A[i][1] is at offset 2  (not 1!)
+A[i][2] is at offset 4
+```
+This would prevent efficient vectorization.
+
+---
+
+## What is a Buffer View?
+
+A **buffer view** is the middle layer between raw memory and tensor view. It provides:
+
+### Core Responsibilities:
+
+1. **Memory Management**
+   - Holds the raw pointer: `T* p_data_`
+   - Tracks buffer size: `BufferSizeType buffer_size_`
+   - Knows the address space: `global`, `lds`, etc.
+
+2. **Vectorized Access**
+   ```cpp
+   template <typename VectorType>
+   CK_TILE_DEVICE VectorType get(index_t offset);
+   ```
+   - Provides efficient vector loads/stores
+   - Handles alignment requirements
+
+3. **Bounds Checking** (optional)
+   ```cpp
+   template <bool oob_conditional_check = true>
+   CK_TILE_DEVICE auto get(index_t i, index_t linear_offset);
+   ```
+   - Can optionally check if access is within bounds
+   - Returns the configured invalid-element value (0 by default) for out-of-bounds accesses
+
+4. **Address Space Awareness**
+   - Uses different load/store instructions based on address space
+   - Global memory: `global_load`, `global_store`
+   - LDS: `ds_read`, `ds_write`
+
+### Buffer View Structure:
+
+```cpp
+template <address_space_enum BufferAddressSpace,
+          typename T,
+          typename BufferSizeType,
+          bool InvalidElementUseNumericalZeroValue,
+          amd_buffer_coherence_enum Coherence>
+struct buffer_view
+{
+    T* p_data_;                              // Raw pointer
+    BufferSizeType buffer_size_;             // Total elements
+    remove_cvref_t<T> invalid_element_value_; // Value for OOB access
+
+    // Access operators
+    const T& operator[](index_t i) const;    // Read
+    T& operator()(index_t i);                // Write
+    
+    // Vectorized access
+    template <typename VectorType>
+    VectorType get(index_t offset);
+};
+```
+
+---
+
+## Visual Example: Matrix A Memory Layout
+
+Let's visualize how matrix A (256×32, fp16) is organized:
+
+### Raw Physical Memory (Linear):
+```
+GPU DRAM Address Space:
+┌─────────────────────────────────────────────────────────────────┐
+│ Byte 0                                                          │
+│ ↓                                                               │
+│ [a₀₀][a₀₁][a₀₂]...[a₀₃₁][a₁₀][a₁₁][a₁₂]...[a₁₃₁][a₂₀]...     │
+│  ↑                        ↑                                     │
+│  Row 0 (32 elements)      Row 1 (32 elements)                  │
+│                                                                 │
+│  Total: 256 rows × 32 cols × 2 bytes/element = 16,384 bytes   │
+└─────────────────────────────────────────────────────────────────┘
+         ↑
+         p_a (raw pointer)
+```
+
+### Buffer View Layer:
+```
+buffer_view<address_space_enum::global, fp16_t, ...>
+┌─────────────────────────────────────────────────────────────────┐
+│ p_data_ = p_a                                                   │
+│ buffer_size_ = 256 × 32 = 8,192 elements                       │
+│ address_space = global (DRAM)                                   │
+│                                                                 │
+│ Provides:                                                       │
+│ • Linear indexing: buffer_view[i] → element at offset i        │
+│ • Vectorized loads: get<fp16x8_t>(offset) → load 8 fp16s at once│
+│ • Bounds checking: is offset < buffer_size_?                   │
+└─────────────────────────────────────────────────────────────────┘
+```
+
+### Tensor View Layer:
+```
+tensor_view<buffer_view, tensor_descriptor>
+┌─────────────────────────────────────────────────────────────────┐
+│ Shape: (256, 32)                                                │
+│ Strides: (32, 1)                                                │
+│ Guaranteed vector length: 8                                     │
+│ Guaranteed vector stride: 1                                     │
+│                                                                 │
+│ Logical 2D View:                                                │
+│     Col:  0    1    2   ...  31                                │
+│   Row 0: [a₀₀][a₀₁][a₀₂] ... [a₀₃₁]  ← Can vector load 8 at once│
+│   Row 1: [a₁₀][a₁₁][a₁₂] ... [a₁₃₁]                           │
+│   Row 2: [a₂₀][a₂₁][a₂₂] ... [a₂₃₁]                           │
+│   ...                                                           │
+│   Row 255: [a₂₅₅,₀] ... [a₂₅₅,₃₁]                             │
+│                                                                 │
+│ Provides:                                                       │
+│ • Multi-dimensional indexing: A[i][j]                          │
+│ • Coordinate transformation: (i,j) → linear offset = i*32 + j  │
+│ • Tile window creation: Extract sub-tensors                    │
+└─────────────────────────────────────────────────────────────────┘
+```
+
+---
+
+## Complete Flow: Raw Memory → Tensor View
+
+Let's trace the complete transformation for matrix A:
+
+### Step 1: Kernel Launch (Host Side)
+```cpp
+// On host: Allocate device memory
+hipMalloc(&p_a, M * K * sizeof(fp16_t));  // Returns raw pointer
+
+// Launch kernel
+kernel<<<grid, block>>>(p_a, p_b, p_c, M, N, K, ...);
+```
+
+### Step 2: Inside Kernel (Device Side)
+```cpp
+// Receive raw pointer
+const fp16_t* p_a;  // Points to GPU DRAM
+
+// Step 2a: Create tensor descriptor
+auto desc = make_naive_tensor_descriptor(
+    make_tuple(256, 32),    // Shape
+    make_tuple(32, 1),      // Strides
+    number<8>{},            // Vector length
+    number<1>{}             // Vector stride
+);
+// desc now knows: "This is a 256×32 tensor, row-major, vectorizable by 8"
+
+// Step 2b: Create buffer view
+auto buffer_view = make_buffer_view<address_space_enum::global>(
+    p_a,                    // Raw pointer
+    256 * 32                // Total elements
+);
+// buffer_view now wraps p_a with size and address space info
+
+// Step 2c: Create tensor view
+auto a_dram = tensor_view{buffer_view, desc};
+// a_dram now provides structured, multi-dimensional access to p_a
+```
+
+### Step 3: Using the Tensor View
+```cpp
+// Access element A[i][j]
+auto value = a_dram[make_tuple(i, j)];
+
+// Create a tile window (sub-tensor)
+auto tile = make_tile_window(
+    a_dram,
+    make_tuple(16, 16),  // 16×16 tile
+    make_tuple(0, 0)     // Starting at origin
+);
+
+// Load tile into registers with vectorization
+auto tile_data = load_tile(tile);  // Uses vector loads internally!
+```
+
+---
+
+## Why This Abstraction?
+
+### Benefits:
+
+1. **Type Safety**: Can't accidentally access wrong dimensions
+2. **Performance**: Compiler knows about vectorization opportunities
+3. **Flexibility**: Same code works for different memory spaces (DRAM, LDS, registers)
+4. **Maintainability**: Logical structure separate from physical layout
+5. **Optimization**: Guaranteed vector properties enable aggressive optimizations
+
+### Example: Without Tensor Views (Manual Indexing)
+```cpp
+// Ugly, error-prone, hard to optimize:
+for (int i = 0; i < 16; i++) {
+    for (int j = 0; j < 16; j++) {
+        float val = p_a[tile_offset_i * stride_a + tile_offset_j + i * stride_a + j];
+        // Hope the compiler vectorizes this? 🤞
+    }
+}
+```
+
+### Example: With Tensor Views (Clean, Optimized)
+```cpp
+// Clean, safe, automatically vectorized:
+auto tile = make_tile_window(a_dram, make_tuple(16, 16), origin);
+auto tile_data = load_tile(tile);  // Vectorized loads guaranteed!
+```
+
+---
+
+## Summary
+
+The `PracticeGemmKernel` entry point transforms raw GPU memory into structured, multi-dimensional tensors through a three-layer abstraction:
+
+1. **Raw Memory**: Linear array of bytes in GPU DRAM
+2. **Buffer View**: Adds size, address space, and vectorized access
+3. **Tensor View**: Adds shape, strides, and multi-dimensional indexing
+
+This abstraction enables:
+- ✅ Clean, readable code
+- ✅ Type-safe multi-dimensional access
+- ✅ Automatic vectorization
+- ✅ Flexible memory space handling
+- ✅ Efficient tile-based computation
+
+The tensor views created here are then passed to the host-level pipeline, which orchestrates the block-level GEMM computation!
+
diff --git a/tutorial/ck_tile/01_naive_gemm/README.md b/tutorial/ck_tile/01_naive_gemm/README.md
new file mode 100644
index 0000000000..f2caf7d993
--- /dev/null
+++ b/tutorial/ck_tile/01_naive_gemm/README.md
@@ -0,0 +1,150 @@
+# CK Tile Practice GEMM Example
+
+This is a practice implementation of a GEMM (General Matrix Multiplication) kernel using the CK Tile API. It demonstrates the fundamental concepts of GPU kernel development using CK Tile's hierarchical tile system.
+
+## CK Tile API Structure
+
+In the composable_kernel library's ck_tile API, **A Kernel is composed of a Problem, a Policy and an Epilogue**:
+
+1. **Problem** describes the shape, data type, data layout, and precision of our GEMM matrices
+2. **Policy** describes how the data in the matrix (or tile) is mapped to the threads
+3. **Epilogue** describes additional computation performed after the GEMM computation (this example does not have an epilogue)
+
+## Overview
+
+This example implements a complete GEMM kernel `C = A × B` using the CK Tile framework, showcasing:
+
+- **Problem Setup** - Setting up the problem (input/output shapes, data types, mathematical operations), composing a kernel (pipeline, policy, epilogue), kernel launch
+- **Block-level Pipelining** - creating tensor views, dispatching to block-level GEMM
+- **Block-level GEMM Computation** - Block tiles, tile window creation, loading/storing to DRAM and Register memory
+- **Warp-level GEMM Computation** - Warp tiles, MFMA level computation
+
+## Problem Setup and Data Flow
+
+### Problem Size Configuration
+We set the problem size using the M, N and K variables:
+```cpp
+ck_tile::index_t M = 1024;   // Number of rows in A and C
+ck_tile::index_t N = 512;  // Number of columns in B and C
+ck_tile::index_t K = 256;  // Number of columns in A, rows in B
+```
+
+### Host Matrix Creation
+Three host matrices A (M×K), B (N×K) and C (M×N) are created, initialized on the CPU and copied over to the GPU global/DRAM memory:
+```cpp
+// Host tensors with proper strides
+ck_tile::HostTensor<ADataType> a_host(a_lengths, a_strides);  // M × K
+ck_tile::HostTensor<BDataType> b_host(b_lengths, b_strides);  // N × K
+ck_tile::HostTensor<CDataType> c_host(c_lengths, c_strides);  // M × N
+
+// Initialize with random data
+ck_tile::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_host);
+ck_tile::FillUniformDistributionIntegerValue<BDataType>{-5.f, 5.f}(b_host);
+
+// Allocate device memory and transfer data
+ck_tile::DeviceMem a_device(a_host);
+a_device.ToDevice(a_host.data());
+```
+
+### PracticeGemmShape Configuration
+A `PracticeGemmShape` struct holds the dimensions of the BlockTile and the WaveTile:
+
+```cpp
+using BlockTile = ck_tile::sequence<256, 128, 32>;  // M, N, K per block
+using WaveTile  = ck_tile::sequence<16, 16, 16>;   // M, N, K per wave
+```
+- The BlockTile covers an M×K (256×32) region of the A matrix and an N×K (128×32) region of the B matrix; the WaveTile covers an M×N (16×16) region of the C matrix.
+
+
+- BlockTiles iterate along the K dimension to fetch the data required for computing the region of C covered by C's block tile.
+- BlockTiles are further subdivided into WaveTiles.
+- WaveTiles over A and B similarly work together to calculate the WaveTile of C.
+
+### Problem and Policy Composition
+```cpp
+// A Problem is composed from Shape and info about the data
+using PracticeGemmHostProblem = ck_tile::
+    PracticeGemmHostProblem<ADataType, BDataType, CDataType, AccDataType, PracticeGemmShape>;
+
+// A Policy is created describing data-to-thread mapping
+using PracticeGemmHostPolicy = ck_tile::PracticeGemmHostPolicy;
+
+// A Kernel is then composed of Problem and Policy
+using gemm_kernel = ck_tile::PracticeGemmKernel<PracticeGemmHostProblem, PracticeGemmHostPolicy>;
+```
+
+### Kernel Launch
+`ck_tile::launch_kernel()` is used to launch the kernel on device. It calls the `operator()` function of `PracticeGemmKernel{}`:
+```cpp
+float ave_time = ck_tile::launch_kernel(
+    ck_tile::stream_config{nullptr, true, 0, 0, 1},
+    ck_tile::make_kernel<kBlockSize, kBlockPerCU>(
+        gemm_kernel{},  // Kernel composed of Problem + Policy
+        kGridSize,      // Grid dimensions
+        kBlockSize,     // Block dimensions
+        0,              // Dynamic shared memory
+        // Kernel arguments: device buffers and problem dimensions
+        a_device.GetDeviceBuffer(), b_device.GetDeviceBuffer(), c_device.GetDeviceBuffer(),
+        M, N, K, stride_a, stride_b, stride_c));
+```
+
+### Result Verification
+The results from the kernel are compared with results from CPU based computation function:
+```cpp
+// CPU reference implementation
+ck_tile::HostTensor<CDataType> c_host_ref(c_lengths, c_strides);
+reference_basic_gemm<ADataType, BDataType, AccDataType, CDataType>(a_host, b_host, c_host_ref);
+
+// Device results
+ck_tile::HostTensor<CDataType> c_host_dev(c_lengths, c_strides);
+
+// Verify correctness
+bool pass = ck_tile::check_err(c_host_dev, c_host_ref);
+```
+
+### Runtime Flow
+
+The main program (`practice_gemm.cpp`) is the entry point for the runtime flow:
+
+```cpp
+int main()
+{
+    // 1. Define data types and problem sizes
+    using ADataType = ck_tile::half_t;
+    ck_tile::index_t M = 1024, N = 512, K = 256;
+
+    // 2. Create host tensors and initialize
+    ck_tile::HostTensor<ADataType> a_host(a_lengths, a_strides);
+    ck_tile::FillUniformDistributionIntegerValue<ADataType>{-5.f, 5.f}(a_host);
+
+    // 3. Allocate device memory and transfer data
+    ck_tile::DeviceMem a_device(a_host);
+
+    // 4. Configure tile shapes
+    using BlockTile = ck_tile::sequence<256, 128, 32>;
+    using WaveTile  = ck_tile::sequence<16, 16, 16>;
+
+    // 5. Launch kernel
+    using gemm_kernel = ck_tile::PracticeGemmKernel<Problem, Policy>;
+    float ave_time = ck_tile::launch_kernel(/*...*/);
+
+    // 6. Verify results
+    bool pass = verify_results(a_host, b_host, c_host);
+
+    // 7. Print performance metrics
+    print_performance_metrics(ave_time, M, N, K);
+}
+```
+
+## Building and Running
+
+```bash
+# From composable_kernel root directory
+mkdir build && cd build
+sh ../script/cmake-ck-dev.sh ../ <arch>
+make tile_example_practice_gemm -j
+
+# Run with sample sizes
+./bin/tile_example_practice_gemm
+```
+This example serves as a foundation for understanding more complex GEMM implementations and optimization strategies in the CK Tile framework.
diff --git a/tutorial/ck_tile/01_naive_gemm/WALKTHROUGH.md b/tutorial/ck_tile/01_naive_gemm/WALKTHROUGH.md
new file mode 100644
index 0000000000..d0b8400b9c
--- /dev/null
+++ b/tutorial/ck_tile/01_naive_gemm/WALKTHROUGH.md
@@ -0,0 +1,506 @@
+# Practice GEMM: Step-by-Step Code Walkthrough
+
+This document provides a detailed walkthrough of `practice_gemm.cpp`, explaining each step of implementing a GEMM (General Matrix Multiplication) kernel using the CK Tile API.
+
+## Overview
+
+We'll implement `C = A × B` where:
+- `A` is an `M × K` matrix
+- `B` is stored as an `N × K` matrix (transposed layout), so the kernel effectively computes `C = A × Bᵀ`, i.e. `C[m][n] = Σₖ A[m][k] · B[n][k]`
+- `C` is an `M × N` matrix
+
+The implementation uses a hierarchical tiling strategy with two levels:
+1. **Block Tiles**: Processed by thread blocks
+2. **Wave Tiles**: Processed by warps (wavefronts) within blocks
+
+---
+
+## Step 1: Define Data Types
+
+```cpp
+using ADataType   = ck_tile::half_t;
+using BDataType   = ck_tile::half_t;
+using CDataType   = float;
+using AccDataType = float;
+```
+
+**What's happening:**
+- We use `half_t` (FP16) for input matrices A and B.
+- We use `float` (FP32) for the output matrix C and for accumulation, to preserve numerical accuracy.
+- In typical CK examples, this information is part of a `GemmConfig` struct, but here we define it directly for simplicity
+---
+
+## Step 2: Define Problem Size
+
+```cpp
+ck_tile::index_t M = 512;
+ck_tile::index_t N = 256;
+ck_tile::index_t K = 64;
+ck_tile::index_t verification = 1;
+
+ck_tile::index_t stride_a = K;
+ck_tile::index_t stride_b = K;
+ck_tile::index_t stride_c = N;
+```
+
+**What's happening:**
+- `M = 512`: Number of rows in A and C
+- `N = 256`: Number of columns in B and C
+- `K = 64`: Inner dimension (columns of A, rows of B)
+- Strides define memory layout (row-major for A and C, transposed for B)
+
+**Memory Layout:**
+```
+Matrix A (M×K):        Matrix B (N×K):        Matrix C (M×N):
+[512 rows]             [256 rows]             [512 rows]
+[64 cols]              [64 cols]              [256 cols]
+stride = K             stride = K             stride = N
+```
+
+---
+
+## Step 3: Create Host Tensors
+
+```cpp
+auto a_lengths = std::array<ck_tile::index_t, 2>{M, K};
+auto b_lengths = std::array<ck_tile::index_t, 2>{N, K};
+auto c_lengths = std::array<ck_tile::index_t, 2>{M, N};
+
+auto a_strides = std::array<ck_tile::index_t, 2>{stride_a, 1};
+auto b_strides = std::array<ck_tile::index_t, 2>{stride_b, 1};
+auto c_strides = std::array<ck_tile::index_t, 2>{stride_c, 1};
+
+ck_tile::HostTensor<ADataType> a_host(a_lengths, a_strides);
+ck_tile::HostTensor<BDataType> b_host(b_lengths, b_strides);
+ck_tile::HostTensor<CDataType> c_host(c_lengths, c_strides);
+```
+
+**What's happening:**
+- We create three tensors on the host (CPU) memory
+- Each tensor is defined by its shape (`lengths`) and memory layout (`strides`)
+- `HostTensor` is a CK Tile utility class that manages CPU memory
+
+**Stride explanation:**
+- For A: `stride_a = K` means moving to the next row requires skipping K elements
+- For B: `stride_b = K` means each of B's N rows holds K contiguous elements, i.e. the logical K × N operand is stored transposed as N × K
+- For C: `stride_c = N` means row-major layout
+
+---
+
+## Step 4: Initialize Tensors with Random Data
+
+```cpp
+ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_host);
+ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_host);
+c_host.SetZero();
+```
+
+**What's happening:**
+- A and B are filled with random values in the range [-5.0, 5.0]
+- C is initialized to zero (will store the output)
+
+**Optional: Print Tensor Contents**
+```cpp
+// Commented out in the code, but available for debugging:
+// a_host.print_first_n(10);  // Print first 10 elements of A
+```
+
+The `print_first_n()` helper function can display tensor contents for debugging purposes.
+
+---
+
+## Step 5: Allocate Device Memory and Transfer Data
+
+```cpp
+ck_tile::DeviceMem a_device(a_host);
+ck_tile::DeviceMem b_device(b_host);
+ck_tile::DeviceMem c_device(c_host);
+```
+
+**What's happening:**
+- `DeviceMem` allocates GPU memory matching the size of host tensors
+- The constructor **automatically transfers data from host to device**
+- This is a convenience wrapper around `hipMalloc` and `hipMemcpy`
+
+**Memory Flow:**
+```
+CPU (Host)              GPU (Device)
+┌─────────┐            ┌─────────┐
+│ a_host  │ ────────>  │a_device │
+│ b_host  │ ────────>  │b_device │
+│ c_host  │ ────────>  │c_device │
+└─────────┘            └─────────┘
+```
+
+---
+
+## Step 6: Configure Hierarchical Tiling
+
+```cpp
+using BlockTile = ck_tile::sequence<256, 128, 32>;
+using WaveTile  = ck_tile::sequence<16, 16, 16>;
+```
+
+**What's happening:**
+- We define a two-level tiling hierarchy for the GEMM computation
+
+### Block Tile (256 × 128 × 32)
+- **256**: M dimension per block (rows of A and C)
+- **128**: N dimension per block (columns of B and C)
+- **32**: K dimension per block (inner dimension)
+- Each block tile is processed by one **thread block** (256 threads)
+
+### Wave Tile (16 × 16 × 16)
+- **16 × 16**: Output tile dimensions (M × N) per warp iteration
+- **16**: K dimension per warp iteration
+- Each wave tile is processed by one **warp** (64 threads on AMD GPUs)
+
+**Important:** The WaveTile (16×16×16) is NOT the same as the MFMA instruction size (32×32×8). The WaveTile represents the work done per warp per iteration, while MFMA is the underlying hardware instruction. Multiple MFMA operations may be needed to compute one wave tile
+
+**Important Note:**
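+In this example, the problem size (512 × 256 × 64) is exactly **twice** the block tile size (256 × 128 × 32) in every dimension, so a 2 × 2 grid of **4 thread blocks** covers C and each block steps through 2 block tiles along K. The diagrams below show a single block tile.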
+
+### Tiling Visualization:
+
+#### Matrix A (M × K = 256 × 32):
+```
+┌─────────────────────────────────────┐
+│  One Block Tile (256 × 32)          │
+│  ┌────┬────┐                        │
+│  │16× │16× │  ← Wave tiles (16×16)  │
+│  │ 16 │ 16 │     in M×K space       │
+│  ├────┼────┤                        │
+│  │    │    │                        │
+│  ├────┼────┤                        │
+│  │ .. │ .. │  16 tiles in M         │
+│  ├────┼────┤  2 tiles in K          │
+│  │    │    │                        │
+│  └────┴────┘                        │
+│                                     │
+└─────────────────────────────────────┘
+```
+
+#### Matrix B (N × K = 128 × 32):
+```
+┌──────────────────────────────┐
+│  One Block Tile (128 × 32)   │
+│  ┌────┬────┐                 │
+│  │16× │16× │  ← Wave tiles   │
+│  │ 16 │ 16 │     (16×16)     │
+│  ├────┼────┤                 │
+│  │    │    │                 │
+│  ├────┼────┤  8 tiles in N   │
+│  │ .. │ .. │  2 tiles in K   │
+│  ├────┼────┤                 │
+│  │    │    │                 │
+│  └────┴────┘                 │
+└──────────────────────────────┘
+```
+
+#### Matrix C (M × N = 256 × 128) - Output:
+```
+┌─────────────────────────────────────────────────┐
+│  One Block Tile (256 × 128)                     │
+│                                                  │
+│  ┌────┬────┬────┬────┬────┬────┬────┬────┐     │
+│  │16× │    │    │    │    │    │    │    │     │
+│  │ 16 │    │    │    │    │    │    │    │     │
+│  ├────┼────┼────┼────┼────┼────┼────┼────┤     │
+│  │    │    │    │    │    │    │    │    │     │
+│  ├────┼────┼────┼────┼────┼────┼────┼────┤     │
+│  │    │    │    │    │    │    │    │    │     │
+│  ├────┼────┼────┼────┼────┼────┼────┼────┤     │
+│  │ .. │ .. │ .. │ .. │ .. │ .. │ .. │ .. │     │
+│  ├────┼────┼────┼────┼────┼────┼────┼────┤     │
+│  │    │    │    │    │    │    │    │    │     │
+│  └────┴────┴────┴────┴────┴────┴────┴────┘     │
+│                                                  │
+│  16 wave tiles in M direction                   │
+│  8 wave tiles in N direction                    │
+│  Total: 128 wave tiles (16×16 each)             │
+└─────────────────────────────────────────────────┘
+```
+
+#### How Wave Tiles Combine (C = A × B):
+```
+Matrix A          Matrix B (stored transposed N×K)          Matrix C
+(256×32)          (128×32)                                  (256×128)
+
+Row of A tiles:   Row of B tiles:                One wave tile in C:
+┌────┬────┐      ┌────┬────┐                    ┌────┐
+│ A₀ │ A₁ │  ×   │ B₀ │ B₁ │                =   │ C  │ (16×16)
+└────┴────┘      └────┴────┘                    └────┘
+  16×16 each       16×16 each
+
+Computation: C = A₀×B₀ᵀ + A₁×B₁ᵀ
+             ↑             ↑
+          K=0..15      K=16..31
+          
+Each wave tile in C is computed by:
+- Taking one row of wave tiles from A (2 tiles along K)
+- Taking one row of wave tiles from B (2 tiles along K)
+  Note: B is stored transposed (N×K), so a "row" in storage corresponds 
+  to a "column" in the logical B^T matrix used in computation
+- Performing dot product: Σ(A_k × B_k^T) for k=0,1
+```
+
+**Key Insight:**
+- Each **wave tile in C** (16×16) requires a **dot product** of 2 wave tiles from A and 2 wave tiles from B
+- Since B is stored transposed (N×K layout), we access **rows** of B tiles in memory
+- This is the fundamental operation repeated across all 128 wave tiles in C
+- Each warp computes one wave tile using MFMA instructions
+
+---
+
+## Step 7: Create Shape, Problem, and Policy Structs
+
+```cpp
+using PracticeGemmShape = ck_tile::PracticeGemmShape<BlockTile, WaveTile>;
+std::cout << "PracticeGemmShape: " << PracticeGemmShape::GetName() << std::endl;
+
+using PracticeGemmHostProblem = ck_tile::
+    PracticeGemmHostProblem<ADataType, BDataType, CDataType, AccDataType, PracticeGemmShape>;
+
+using PracticeGemmHostPolicy = ck_tile::PracticeGemmHostPolicy;
+```
+
+**What's happening:**
+
+### 1. **Shape Struct**
+Encapsulates all tile shape information (BlockTile and WaveTile dimensions).
+
+### 2. **Problem Struct**
+Holds complete problem description:
+- Data types (ADataType, BDataType, CDataType, AccDataType)
+- Shape information (BlockTile, WaveTile)
+
+In more complex examples, this would also include:
+- Data layouts (row-major, column-major)
+- Mathematical operations (e.g., transposed GEMM)
+
+### 3. **Policy Struct**
+Describes data movement and thread-to-data mapping:
+- Currently contains `MakeBlock2TileMap()`: Maps thread block IDs to tile positions
+- In more complex kernels, includes:
+  - DRAM access patterns
+  - LDS (Local Data Share) usage strategies
+  - Thread distribution within blocks
+
+**CK Tile Design Pattern:**
+```
+Kernel = Problem + Policy + Epilogue
+         ↑         ↑        ↑
+      (What)    (How)   (Post-processing)
+```
+
+---
+
+## Step 8: Calculate Grid and Block Dimensions
+
+```cpp
+ck_tile::index_t kGridSize = ck_tile::integer_divide_ceil(M, PracticeGemmShape::BlockTile_M) *
+                             ck_tile::integer_divide_ceil(N, PracticeGemmShape::BlockTile_N);
+
+std::cout << "kGridSize: " << kGridSize << std::endl;
+
+constexpr ck_tile::index_t kBlockSize = 256;
+constexpr ck_tile::index_t kBlockPerCU = 1;
+```
+
+**What's happening:**
+
+### Grid Size Calculation
+```cpp
+kGridSize = ceil(M / BlockTile_M) × ceil(N / BlockTile_N)
+          = ceil(512 / 256) × ceil(256 / 128)
+          = 2 × 2
+          = 4 thread blocks
+```
+
+Our problem requires **4 thread blocks** to cover the entire output matrix C (2 blocks in M direction, 2 blocks in N direction).
+
+### Block Configuration
+- `kBlockSize = 256`: Each thread block has 256 threads
+  - 256 threads / 64 threads per warp = **4 warps per block**
+- `kBlockPerCU = 1`: Launch 1 block per Compute Unit (for simplicity)
+
+**Thread Hierarchy:**
+```
+GPU
+└── Grid of 4 Thread Blocks, each containing:
+    └── 256 Threads
+        ├── Warp 0 (threads 0-63)
+        ├── Warp 1 (threads 64-127)
+        ├── Warp 2 (threads 128-191)
+        └── Warp 3 (threads 192-255)
+```
+
+---
+
+## Step 9: Create and Launch the Kernel
+
+```cpp
+using gemm_kernel =
+    ck_tile::PracticeGemmKernel<PracticeGemmHostProblem, PracticeGemmHostPolicy>;
+
+float ave_time = ck_tile::launch_kernel(
+    ck_tile::stream_config{nullptr, true, 0, 0, 1},
+    ck_tile::make_kernel<kBlockPerCU>(gemm_kernel{},
+                                      kGridSize,
+                                      kBlockSize,
+                                      0,
+                                      static_cast<ADataType*>(a_device.GetDeviceBuffer()),
+                                      static_cast<BDataType*>(b_device.GetDeviceBuffer()),
+                                      static_cast<CDataType*>(c_device.GetDeviceBuffer()),
+                                      M,
+                                      N,
+                                      K,
+                                      stride_a,
+                                      stride_b,
+                                      stride_c));
+```
+
+**What's happening:**
+
+### 1. Kernel Composition
+```cpp
+using gemm_kernel = ck_tile::PracticeGemmKernel<Problem, Policy>;
+```
+The kernel is composed from Problem and Policy structs, following the CK Tile design pattern.
+
+### 2. Kernel Launch
+`launch_kernel()` is a CK Tile utility that:
+- Launches the GPU kernel using HIP runtime
+- Measures execution time
+- Returns average execution time in milliseconds
+
+### 3. Launch Parameters
+- **Stream config**: `{nullptr, true, 0, 0, 1}` - default stream, timing enabled
+- **Grid size**: `kGridSize = 4` - number of thread blocks (computed in Step 8)
+- **Block size**: `kBlockSize = 256` - threads per block
+- **Shared memory**: `0` - no dynamic shared memory in this example
+- **Kernel arguments**: Device pointers and problem dimensions
+
+### 4. Kernel Execution Flow
+```
+launch_kernel() calls gemm_kernel.operator()()
+    ↓
+PracticeGemmKernel::operator()
+    ↓
+Creates tensor views over device memory
+    ↓
+Calls block-level pipeline
+    ↓
+Block pipeline calls warp-level pipeline
+    ↓
+Warp pipeline calls MFMA instructions
+    ↓
+Results written back to C matrix
+```
+
+---
+
+## Step 10: Verify Results
+
+```cpp
+auto pass = true;
+
+if(verification)
+{
+    // Reference gemm on CPU
+    ck_tile::HostTensor<CDataType> c_host_ref(c_lengths, c_strides);
+    reference_basic_gemm<ADataType, BDataType, AccDataType, CDataType>(
+        a_host, b_host, c_host_ref);
+    
+    // Copy GPU results back to host
+    ck_tile::HostTensor<CDataType> c_host_dev(c_lengths, c_strides);
+    c_device.FromDevice(c_host_dev.mData.data());
+    
+    // Compare results
+    pass &= ck_tile::check_err(c_host_dev, c_host_ref, "Error: Incorrect results!", 1e-3, 1e-3);
+    std::cout << "valid:" << (pass ? "y" : "n") << std::endl;
+}
+```
+
+**What's happening:**
+
+### 1. CPU Reference Implementation
+```cpp
+reference_basic_gemm<...>(a_host, b_host, c_host_ref);
+```
+Computes GEMM on CPU using a simple nested loop implementation (ground truth).
+
+### 2. Copy GPU Results to Host
+```cpp
+c_device.FromDevice(c_host_dev.mData.data());
+```
+Transfers the computed result from GPU memory back to CPU for comparison.
+
+### 3. Error Checking
+```cpp
+ck_tile::check_err(c_host_dev, c_host_ref, "Error: Incorrect results!", 1e-3, 1e-3);
+```
+Compares GPU and CPU results element-wise with tolerance:
+- **Relative error**: 1e-3 (0.1%)
+- **Absolute error**: 1e-3
+
+**Verification Flow:**
+```
+CPU                     GPU
+┌─────────┐            ┌─────────┐
+│ a_host  │ ────────>  │a_device │
+│ b_host  │ ────────>  │b_device │
+└─────────┘            └─────────┘
+     │                      │
+     ↓                      ↓
+reference_gemm()       GPU kernel
+     │                      │
+     ↓                      ↓
+┌──────────┐          ┌──────────┐
+│c_host_ref│          │c_device  │
+└──────────┘          └──────────┘
+     │                      │
+     │                      ↓
+     │                 FromDevice()
+     │                      │
+     ↓                      ↓
+     └────> check_err() <───┘
+                 │
+                 ↓
+            Pass/Fail
+```
+
+---
+
+## Complete Execution Flow Summary
+
+```
+1. Define data types (FP16 inputs, FP32 output)
+   ↓
+2. Set problem size (M=512, N=256, K=64)
+   ↓
+3. Create host tensors and initialize with random data
+   ↓
+4. Allocate device memory and transfer data (CPU → GPU)
+   ↓
+5. Configure hierarchical tiling (BlockTile, WaveTile)
+   ↓
+6. Create Shape, Problem, and Policy structs
+   ↓
+7. Calculate grid/block dimensions (4 blocks, 256 threads each)
+   ↓
+8. Compose and launch kernel (Problem + Policy)
+   ↓
+9. Execute GEMM on GPU
+   │  ├─ Block-level pipeline
+   │  ├─ Warp-level pipeline
+   │  └─ MFMA instructions
+   ↓
+10. Verify results (compare GPU vs CPU reference)
+    ↓
+11. Calculate and print performance metrics
+    ↓
+12. Return success/failure
+```
+
+---
\ No newline at end of file
diff --git a/tutorial/ck_tile/01_naive_gemm/block_level/practice_gemm_block_pipeline_agmem_bgmem_creg.hpp b/tutorial/ck_tile/01_naive_gemm/block_level/practice_gemm_block_pipeline_agmem_bgmem_creg.hpp
new file mode 100644
index 0000000000..31fa4ac3eb
--- /dev/null
+++ b/tutorial/ck_tile/01_naive_gemm/block_level/practice_gemm_block_pipeline_agmem_bgmem_creg.hpp
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+
+namespace ck_tile {
+
+template <typename Problem, typename Policy = PracticeGemmBlockPolicy>
+struct PracticeGemmBlockPipelineAGmemBGmemCreg // block GEMM: A/B DRAM -> regs -> LDS -> C acc tile
+{
+    using ADataType   = typename Problem::ADataType;
+    using BDataType   = typename Problem::BDataType;
+    using CDataType   = typename Problem::CDataType;
+    using AccDataType = typename Problem::AccDataType;
+
+    using BlockTile = typename Problem::Shape::BlockTile; // indexed below as (M, N, K)
+    using WaveTile  = typename Problem::Shape::WaveTile;  // indexed below as (M, N, K)
+
+    static constexpr index_t MPerBlock = BlockTile::at(number<0>{});
+    static constexpr index_t NPerBlock = BlockTile::at(number<1>{});
+    static constexpr index_t KPerBlock = BlockTile::at(number<2>{});
+
+    static constexpr index_t MPerWave = WaveTile::at(number<0>{}); // wave extents are not
+    static constexpr index_t NPerWave = WaveTile::at(number<1>{}); // referenced elsewhere
+    static constexpr index_t KPerWave = WaveTile::at(number<2>{}); // inside this struct
+
+    using BlockGemm = // type of the object returned by the policy's GEMM factory
+        remove_cvref_t<decltype(Policy::template GetPracticeWaveGemmPipeline<Problem>())>;
+    // LDS bytes needed: the A tile rounded up to a multiple of 16 bytes, followed by the B tile.
+    CK_TILE_HOST_DEVICE static constexpr ck_tile::index_t GetStaticLDSSize()
+    {
+        return integer_divide_ceil(
+                   sizeof(ADataType) *
+                       Policy::template MakeALdsBlockDescriptor<Problem>().get_element_space_size(),
+                   16) *
+                   16 + // A region in bytes, rounded up to a multiple of 16
+               sizeof(BDataType) *
+                   Policy::template MakeBLdsBlockDescriptor<Problem>().get_element_space_size();
+    }
+    // Runs num_loop K-steps over the A/B DRAM windows, staging via p_smem; returns the C tile.
+    template <typename ADramBlockWindowTmp, typename BDramBlockWindowTmp>
+    CK_TILE_HOST_DEVICE auto operator()(const ADramBlockWindowTmp& a_dram_block_window_tmp,
+                                        const BDramBlockWindowTmp& b_dram_block_window_tmp,
+                                        index_t num_loop, // iterations; each advances K by KPerBlock
+                                        void* p_smem) const // viewed as LDS: A tile first, then B
+    {
+        static_assert(
+            std::is_same_v<ADataType, remove_cvref_t<typename ADramBlockWindowTmp::DataType>> &&
+                std::is_same_v<BDataType, remove_cvref_t<typename BDramBlockWindowTmp::DataType>>,
+            "wrong!"); // window element types must match Problem's A/B types
+
+        static_assert(MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
+                          NPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[number<0>{}] &&
+                          KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[number<1>{}],
+                      "wrong!"); // A window is MPerBlock x KPerBlock; B's first length is NPerBlock
+
+        // -----------------------------------------------------------------------------------------
+        // Definitions of all needed tiles
+
+        // A tile in LDS, starting at the beginning of p_smem
+        ADataType* p_a_lds = static_cast<ADataType*>(p_smem);
+
+        constexpr auto a_lds_block_desc = Policy::template MakeALdsBlockDescriptor<Problem>();
+
+        auto a_lds_block = make_tensor_view<address_space_enum::lds>(p_a_lds, a_lds_block_desc);
+
+        constexpr index_t a_lds_block_space_size_aligned =
+            integer_divide_ceil(sizeof(ADataType) * a_lds_block_desc.get_element_space_size(), 16) *
+            16; // A bytes rounded up to a multiple of 16 (same formula as GetStaticLDSSize)
+
+        // B tile in LDS, placed right after the 16-byte-rounded A region
+        BDataType* p_b_lds = static_cast<BDataType*>(
+            static_cast<void*>(static_cast<char*>(p_smem) + a_lds_block_space_size_aligned));
+
+        constexpr auto b_lds_block_desc = Policy::template MakeBLdsBlockDescriptor<Problem>();
+
+        auto b_lds_block = make_tensor_view<address_space_enum::lds>(p_b_lds, b_lds_block_desc);
+
+        // A DRAM tile window for load: input window's view/origin + the policy's tile distribution
+        auto a_copy_dram_window =
+            make_tile_window(a_dram_block_window_tmp.get_bottom_tensor_view(),
+                             make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
+                             a_dram_block_window_tmp.get_window_origin(),
+                             Policy::template MakeADramTileDistribution<Problem>());
+
+        // A LDS tile window for store, reusing the A DRAM window's tile distribution
+        auto a_copy_lds_window =
+            make_tile_window(a_lds_block,
+                             make_tuple(number<MPerBlock>{}, number<KPerBlock>{}),
+                             {0, 0},
+                             a_copy_dram_window.get_tile_distribution());
+
+        // B DRAM tile window for load: input window's view/origin + the policy's tile distribution
+        auto b_copy_dram_window =
+            make_tile_window(b_dram_block_window_tmp.get_bottom_tensor_view(),
+                             make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
+                             b_dram_block_window_tmp.get_window_origin(),
+                             Policy::template MakeBDramTileDistribution<Problem>());
+
+        // B LDS tile window for store, reusing the B DRAM window's tile distribution
+        auto b_copy_lds_window =
+            make_tile_window(b_lds_block,
+                             make_tuple(number<NPerBlock>{}, number<KPerBlock>{}),
+                             {0, 0},
+                             b_copy_dram_window.get_tile_distribution());
+
+        // A LDS tile for block GEMM (created without an explicit tile distribution)
+        auto a_lds_gemm_window = make_tile_window(
+            a_lds_block, make_tuple(number<MPerBlock>{}, number<KPerBlock>{}), {0, 0});
+
+        // B LDS tile for block GEMM (created without an explicit tile distribution)
+        auto b_lds_gemm_window = make_tile_window(
+            b_lds_block, make_tuple(number<NPerBlock>{}, number<KPerBlock>{}), {0, 0});
+
+        // Block GEMM
+        auto block_gemm = BlockGemm();
+
+        // Acc register tile: default-constructed value of the type block_gemm(a, b) returns
+        auto c_block_tile = decltype(block_gemm(a_lds_gemm_window, b_lds_gemm_window)){};
+
+        using ABlockTileDistr = decltype(a_copy_dram_window.get_tile_distribution());
+        using BBlockTileDistr = decltype(b_copy_dram_window.get_tile_distribution());
+
+        using ABlockTile = decltype(make_static_distributed_tensor<ADataType>(ABlockTileDistr{}));
+        using BBlockTile = decltype(make_static_distributed_tensor<BDataType>(BBlockTileDistr{}));
+
+        ABlockTile a_block_tile; // holds the A tile between the DRAM load and the LDS store
+        BBlockTile b_block_tile; // holds the B tile between the DRAM load and the LDS store
+        using ADramTileWindowStep = typename ADramBlockWindowTmp::BottomTensorIndex;
+        using BDramTileWindowStep = typename BDramBlockWindowTmp::BottomTensorIndex;
+        constexpr ADramTileWindowStep a_dram_tile_window_step = make_array(0, KPerBlock); // +K only
+        constexpr BDramTileWindowStep b_dram_tile_window_step = make_array(0, KPerBlock); // +K only
+
+        // -------------------------------------------------------------------------------------
+        // Gemm pipeline start
+
+        // Initialize C: set every accumulator element to zero
+        tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile);
+        // non-prefetch: each iteration finishes load -> LDS store -> GEMM before the next load
+        index_t iCounter = num_loop;
+
+        while(iCounter > 0)
+        {
+            a_block_tile = load_tile(a_copy_dram_window); // from DRAM to registers
+            b_block_tile = load_tile(b_copy_dram_window); // from DRAM to registers
+            move_tile_window(a_copy_dram_window, a_dram_tile_window_step); // next K tile of A
+            move_tile_window(b_copy_dram_window, b_dram_tile_window_step); // next K tile of B
+            store_tile(a_copy_lds_window, a_block_tile); // from registers to LDS
+            store_tile(b_copy_lds_window, b_block_tile); // from registers to LDS
+
+            block_sync_lds(); // presumably a block-wide barrier so the GEMM sees all stores - confirm
+            block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); // from LDS to registers
+            block_sync_lds(); // presumably guards LDS against the next iteration's stores - confirm
+
+            iCounter--;
+        }
+
+        return c_block_tile;
+    }
+};
+
+} // namespace ck_tile
diff --git a/tutorial/ck_tile/01_naive_gemm/block_level/practice_gemm_block_policy_agmem_bgmem_creg.hpp b/tutorial/ck_tile/01_naive_gemm/block_level/practice_gemm_block_policy_agmem_bgmem_creg.hpp
new file mode 100644
index 0000000000..99c4379ad8
--- /dev/null
+++ b/tutorial/ck_tile/01_naive_gemm/block_level/practice_gemm_block_policy_agmem_bgmem_creg.hpp
@@ -0,0 +1,135 @@
+#pragma once
+
+#include "ck_tile/host.hpp"
+#include "ck_tile/core.hpp"
+
+#include "../warp_level/practice_gemm_warp_policy_asmem_bsmem_creg.hpp"
+#include "../warp_level/practice_gemm_warp_pipeline_asmem_bsmem_creg.hpp"
+
+namespace ck_tile {
+
+template <typename ADataType_,
+          typename BDataType_,
+          typename CDataType_,
+          typename AccDataType_,
+          typename Shape_>
+struct PracticeGemmBlockPipelineProblem // compile-time bundle of element types and tile shape
+{
+    using ADataType   = ADataType_;   // element type of A
+    using BDataType   = BDataType_;   // element type of B
+    using CDataType   = CDataType_;   // element type of C
+    using AccDataType = AccDataType_; // accumulator element type
+    using Shape       = Shape_;       // the block policy reads Shape::BlockTile from this
+};
+
+struct PracticeGemmBlockPolicy // stateless: every member is a static constexpr factory
+{
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetPracticeWaveGemmPipeline()
+    {
+        return PracticeGemmWarpPipelineASmemBSmemCreg<Problem>{}; // warp-level pipeline object
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeALdsBlockDescriptor()
+    {
+        constexpr index_t kMPerBlock = Problem::Shape::BlockTile::at(number<0>{}); // BlockTile[0]
+        constexpr index_t kKPerBlock = Problem::Shape::BlockTile::at(number<2>{}); // BlockTile[2]
+        constexpr index_t kKPack     = 8; // length of the innermost K group
+
+        constexpr auto a_lds_block_desc_0 = make_naive_tensor_descriptor( // 3-D descriptor
+            make_tuple(number<kMPerBlock>{}, number<kKPerBlock / kKPack>{}, number<kKPack>{}),
+            make_tuple(number<kKPerBlock>{}, number<kKPack>{}, number<1>{}), // strides
+            number<kKPack>{},
+            number<1>{});
+
+        constexpr auto a_lds_block_desc = transform_tensor_descriptor( // 2-D (M, K) view
+            a_lds_block_desc_0,
+            make_tuple(make_pass_through_transform(kMPerBlock),
+                       make_merge_transform(make_tuple(kKPerBlock / kKPack, kKPack))),
+            make_tuple(sequence<0>{}, sequence<1, 2>{}), // dim 0 passes through; dims 1,2 merge
+            make_tuple(sequence<0>{}, sequence<1>{}));
+        return a_lds_block_desc;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeBLdsBlockDescriptor()
+    {
+        constexpr index_t kNPerBlock = Problem::Shape::BlockTile::at(number<1>{}); // BlockTile[1]
+        constexpr index_t kKPerBlock = Problem::Shape::BlockTile::at(number<2>{}); // BlockTile[2]
+        constexpr index_t kKPack     = 8; // length of the innermost K group
+
+        constexpr auto b_lds_block_desc_0 = make_naive_tensor_descriptor( // 3-D descriptor
+            make_tuple(number<kNPerBlock>{}, number<kKPerBlock / kKPack>{}, number<kKPack>{}),
+            make_tuple(number<kKPerBlock>{}, number<kKPack>{}, number<1>{}), // strides
+            number<kKPack>{},
+            number<1>{});
+
+        constexpr auto b_lds_block_desc = transform_tensor_descriptor( // 2-D (N, K) view
+            b_lds_block_desc_0,
+            make_tuple(make_pass_through_transform(kNPerBlock),
+                       make_merge_transform(make_tuple(kKPerBlock / kKPack, kKPack))),
+            make_tuple(sequence<0>{}, sequence<1, 2>{}), // dim 0 passes through; dims 1,2 merge
+            make_tuple(sequence<0>{}, sequence<1>{}));
+
+        return b_lds_block_desc;
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeADramTileDistribution()
+    {
+        using ADataType          = remove_cvref_t<typename Problem::ADataType>;
+        using BlockGemm          = remove_cvref_t<decltype(GetPracticeWaveGemmPipeline<Problem>())>;
+        constexpr index_t kMWarp = BlockGemm::MWarp;
+        constexpr index_t kNWarp = BlockGemm::NWarp;
+        constexpr index_t kBlockSize = kMWarp * kNWarp * get_warp_size(); // warps * warp size
+
+        constexpr index_t kMPerBlock = Problem::Shape::BlockTile::at(number<0>{});
+        constexpr index_t kKPerBlock = Problem::Shape::BlockTile::at(number<2>{});
+
+        constexpr index_t K1 = 16 / sizeof(ADataType); // elements that fit in 16 bytes
+        constexpr index_t K0 = kKPerBlock / K1;        // K1-sized chunks along K
+        constexpr index_t M2 = get_warp_size() / K0;   // warp size divided by chunks along K
+        // coalesce reads within each block
+        constexpr index_t M1 = kBlockSize / get_warp_size(); // equals kMWarp * kNWarp
+        constexpr index_t M0 = kMPerBlock / (M2 * M1);       // remaining factor of the M extent
+
+        return make_static_tile_distribution(
+            tile_distribution_encoding<sequence<1>,
+                                       tuple<sequence<M0, M1, M2>, sequence<K0, K1>>,
+                                       tuple<sequence<1>, sequence<1, 2>>, // P dims: M1; (M2, K0)
+                                       tuple<sequence<1>, sequence<2, 0>>,
+                                       sequence<1, 2>, // Y dims: M0 and K1
+                                       sequence<0, 1>>{});
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto MakeBDramTileDistribution()
+    {
+        using BDataType          = remove_cvref_t<typename Problem::BDataType>;
+        using BlockGemm          = remove_cvref_t<decltype(GetPracticeWaveGemmPipeline<Problem>())>;
+        constexpr index_t kMWarp = BlockGemm::MWarp;
+        constexpr index_t kNWarp = BlockGemm::NWarp;
+        constexpr index_t kBlockSize = kMWarp * kNWarp * get_warp_size(); // warps * warp size
+
+        constexpr index_t kNPerBlock = Problem::Shape::BlockTile::at(number<1>{});
+        constexpr index_t kKPerBlock = Problem::Shape::BlockTile::at(number<2>{});
+
+        constexpr index_t K1 = 16 / sizeof(BDataType); // elements that fit in 16 bytes
+        constexpr index_t K0 = kKPerBlock / K1;        // K1-sized chunks along K
+        constexpr index_t N2 = get_warp_size() / K0;   // warp size divided by chunks along K
+        // coalesce reads within each block
+        constexpr index_t N1 = kBlockSize / get_warp_size(); // equals kMWarp * kNWarp
+        constexpr index_t N0 = kNPerBlock / (N2 * N1);       // remaining factor of the N extent
+
+        return make_static_tile_distribution(
+            tile_distribution_encoding<sequence<1>,
+                                       tuple<sequence<N0, N1, N2>, sequence<K0, K1>>,
+                                       tuple<sequence<1>, sequence<1, 2>>, // P dims: N1; (N2, K0)
+                                       tuple<sequence<1>, sequence<2, 0>>,
+                                       sequence<1, 2>, // Y dims: N0 and K1
+                                       sequence<0, 1>>{});
+    }
+};
+
+} // namespace ck_tile
diff --git a/tutorial/ck_tile/01_naive_gemm/host_level/practice_gemm_host_pipeline_agmem_bgmem_creg.hpp b/tutorial/ck_tile/01_naive_gemm/host_level/practice_gemm_host_pipeline_agmem_bgmem_creg.hpp
new file mode 100644
index 0000000000..ef12634e42
--- /dev/null
+++ b/tutorial/ck_tile/01_naive_gemm/host_level/practice_gemm_host_pipeline_agmem_bgmem_creg.hpp
@@ -0,0 +1,92 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+
+namespace ck_tile {
+template <typename Problem_, typename Policy_ = PracticeGemmHostPolicy>
+struct PracticeGemmHostPipeline // per-block driver: picks this block's C tile and computes it
+{
+    using ADataType   = typename Problem_::ADataType;
+    using BDataType   = typename Problem_::BDataType;
+    using CDataType   = typename Problem_::CDataType;
+    using AccDataType = typename Problem_::AccDataType;
+
+    using Problem = remove_cvref_t<Problem_>;
+    using Policy  = remove_cvref_t<Policy_>;
+
+    using BlockTile = typename Problem::Shape::BlockTile;
+    using WaveTile  = typename Problem::Shape::WaveTile;
+
+    template <typename ADRAMTensorView, typename BDRAMTensorView, typename CDRAMTensorView>
+    CK_TILE_DEVICE void operator()(const ADRAMTensorView& a_dram, // A view, lengths (M, K)
+                                   const BDRAMTensorView& b_dram, // B view, lengths (N, K)
+                                   CDRAMTensorView& c_dram) const // C view, lengths (M, N)
+    {
+
+        // Size of the entire problem
+        const auto M = a_dram.get_tensor_descriptor().get_length(number<0>{}); // M x K
+        const auto N = c_dram.get_tensor_descriptor().get_length(number<1>{}); // M x N
+        const auto K = a_dram.get_tensor_descriptor().get_length(number<1>{}); // M x K
+
+        // Size of the block tile
+        const auto MPerBlock = BlockTile::at(number<0>{});
+        const auto NPerBlock = BlockTile::at(number<1>{});
+        const auto KPerBlock = BlockTile::at(number<2>{});
+
+        // Number of block tile in the N direction to cover C (resultant) matrix
+        const auto num_tile_n = integer_divide_ceil(N, NPerBlock);
+        // Number of block tile in the M direction to cover C (resultant) matrix
+        const auto num_tile_m = integer_divide_ceil(M, MPerBlock);
+
+        // if(get_thread_id() == 0 && get_block_id() == 0)
+        // {
+        //     printf("num_tile_m: %d, num_tile_n: %d\n", num_tile_m, num_tile_n);
+        //     printf("total number of tiles: %d\n", num_tile_m * num_tile_n);
+        // }
+
+        // Get block id
+        const auto id_block =
+            get_block_id(); // 0 to (M_block/BlockTile_M) * (N_block/BlockTile_N) - 1
+
+        // Map block id to tile id
+        const auto block2tile = Policy::MakeBlock2TileMap(num_tile_m, num_tile_n);
+
+        const auto tile_id = block2tile(id_block); // 2-D tile index: (m, n)
+
+        const auto tile_id_m = tile_id.at(number<0>{});
+        const auto tile_id_n = tile_id.at(number<1>{});
+
+        // if(get_thread_id() == 0 && get_block_id() == 15)
+        // {
+        //     printf("tile_id_m: %d, tile_id_n: %d\n", tile_id_m, tile_id_n);
+        // }
+
+        const auto tile_origin_m = tile_id_m * MPerBlock; // first row of this block's C tile
+        const auto tile_origin_n = tile_id_n * NPerBlock; // first column of this block's C tile
+
+        // create a tile window over dram for A and B
+        const auto a_block_window = make_tile_window(
+            a_dram, make_tuple(number<MPerBlock>{}, number<KPerBlock>{}), {tile_origin_m, 0});
+
+        const auto b_block_window = make_tile_window(
+            b_dram, make_tuple(number<NPerBlock>{}, number<KPerBlock>{}), {tile_origin_n, 0});
+
+        constexpr auto block_gemm_pipeline =
+            Policy::template GetPracticeGemmBlockPipeline<Problem>();
+
+        int num_loops_k = integer_divide_ceil(K, KPerBlock); // K-direction iterations
+
+        __shared__ char p_smem_char[block_gemm_pipeline.GetStaticLDSSize()]; // LDS scratch buffer
+        const auto c_block_tile =
+            block_gemm_pipeline(a_block_window, b_block_window, num_loops_k, p_smem_char);
+        auto c_window = make_tile_window(c_dram,
+                                         make_tuple(number<MPerBlock>{}, number<NPerBlock>{}),
+                                         {tile_origin_m, tile_origin_n});
+        store_tile(c_window, c_block_tile); // write the finished tile through the C window
+    }
+};
+} // namespace ck_tile
diff --git a/tutorial/ck_tile/01_naive_gemm/host_level/practice_gemm_host_policy_agmem_bgmem_creg.hpp b/tutorial/ck_tile/01_naive_gemm/host_level/practice_gemm_host_policy_agmem_bgmem_creg.hpp
new file mode 100644
index 0000000000..d66c3c8522
--- /dev/null
+++ b/tutorial/ck_tile/01_naive_gemm/host_level/practice_gemm_host_policy_agmem_bgmem_creg.hpp
@@ -0,0 +1,51 @@
+#pragma once
+
+#include "ck_tile/host.hpp"
+#include "ck_tile/core.hpp"
+
+#include "../block_level/practice_gemm_block_policy_agmem_bgmem_creg.hpp"
+#include "../block_level/practice_gemm_block_pipeline_agmem_bgmem_creg.hpp"
+
+namespace ck_tile {
+
+template <typename ADataType_,
+          typename BDataType_,
+          typename CDataType_,
+          typename AccDataType_,
+          typename Shape_>
+struct PracticeGemmHostProblem // compile-time bundle of element types and tile shape
+{
+    using ADataType   = ADataType_;   // element type of A
+    using BDataType   = BDataType_;   // element type of B
+    using CDataType   = CDataType_;   // element type of C
+    using AccDataType = AccDataType_; // accumulator element type
+    using Shape       = remove_cvref_t<Shape_>; // cv/ref-stripped shape type
+};
+
+struct PracticeGemmHostPolicy // stateless: every member is a static constexpr factory
+{
+    CK_TILE_HOST_DEVICE static constexpr auto MakeBlock2TileMap(index_t M0, index_t N0)
+    {
+        const auto unmerge = make_merge_transform(make_tuple(N0, M0)); // note the (N0, M0) order
+
+        return [unmerge](index_t block_id) { // maps a linear block id to a 2-D tile index
+            multi_index<2> unmerged;
+            unmerge.calculate_lower_index(unmerged, make_multi_index(block_id));
+
+            return make_multi_index(unmerged.at(number<1>{}), unmerged.at(number<0>{})); // (m, n)
+        };
+    }
+
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetPracticeGemmBlockPipeline()
+    {
+        using PracticeGemmBlockPipelineProblem_ = // same types and shape, re-bundled
+            PracticeGemmBlockPipelineProblem<typename Problem::ADataType,
+                                             typename Problem::BDataType,
+                                             typename Problem::CDataType,
+                                             typename Problem::AccDataType,
+                                             typename Problem::Shape>;
+        return PracticeGemmBlockPipelineAGmemBGmemCreg<PracticeGemmBlockPipelineProblem_>{};
+    }
+};
+} // namespace ck_tile
diff --git a/tutorial/ck_tile/01_naive_gemm/practice_gemm.cpp b/tutorial/ck_tile/01_naive_gemm/practice_gemm.cpp
new file mode 100644
index 0000000000..ee2e125e24
--- /dev/null
+++ b/tutorial/ck_tile/01_naive_gemm/practice_gemm.cpp
@@ -0,0 +1,131 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <iostream>
+#include "ck_tile/host.hpp"
+#include "practice_gemm.hpp"
+#include "reference_gemm.hpp"
+
+int main() // runs the practice GEMM once, optionally verifies it, and prints timing
+{
+    // TODO: GemmTypeConfig
+    using ADataType   = ck_tile::half_t;
+    using BDataType   = ck_tile::half_t;
+    using CDataType   = float;
+    using AccDataType = float;
+
+    // ArgParser
+    ck_tile::index_t M            = 512;
+    ck_tile::index_t N            = 256;
+    ck_tile::index_t K            = 64;
+    ck_tile::index_t verification = 1; // non-zero enables the host reference check below
+
+    ck_tile::index_t stride_a = K; // A is M x K with unit stride along K
+    ck_tile::index_t stride_b = K; // B is N x K with unit stride along K
+    ck_tile::index_t stride_c = N; // C is M x N with unit stride along N
+
+    auto a_lengths = std::array<ck_tile::index_t, 2>{M, K};
+    auto b_lengths = std::array<ck_tile::index_t, 2>{N, K};
+    auto c_lengths = std::array<ck_tile::index_t, 2>{M, N};
+
+    auto a_strides = std::array<ck_tile::index_t, 2>{stride_a, 1};
+    auto b_strides = std::array<ck_tile::index_t, 2>{stride_b, 1};
+    auto c_strides = std::array<ck_tile::index_t, 2>{stride_c, 1};
+
+    // tensors on host (cpu)
+    ck_tile::HostTensor<ADataType> a_host(a_lengths, a_strides);
+    ck_tile::HostTensor<BDataType> b_host(b_lengths, b_strides);
+    ck_tile::HostTensor<CDataType> c_host(c_lengths, c_strides);
+
+    // initialize tensors
+    ck_tile::FillUniformDistribution<ADataType>{-5.f, 5.f}(a_host);
+    ck_tile::FillUniformDistribution<BDataType>{-5.f, 5.f}(b_host);
+    c_host.SetZero();
+
+    // Print the tensors using the new print_first_n member function
+    // std::cout << "Tensor A (first 10 elements): ";
+    // a_host.print_first_n(10);
+    // std::cout << std::endl;
+
+    // std::cout << "Tensor B (first 10 elements): ";
+    // b_host.print_first_n(10);
+    // std::cout << std::endl;
+
+    // std::cout << "Tensor C (first 10 elements): ";
+    // c_host.print_first_n(10);
+    // std::cout << std::endl;
+
+    // Create device tensors of same size as host tensors and copy data
+    ck_tile::DeviceMem a_device(a_host);
+    ck_tile::DeviceMem b_device(b_host);
+    ck_tile::DeviceMem c_device(c_host);
+
+    // TODO: BlockTileConfig
+    // constexpr ck_tile::index_t warpSize    = 64;
+    constexpr ck_tile::index_t kBlockSize = 256; // threads per block passed to make_kernel
+
+    using BlockTile = ck_tile::sequence<256, 128, 32>; // (M, N, K) extents of one block tile
+    using WaveTile  = ck_tile::sequence<16, 16, 16>;   // (M, N, K) extents of one wave tile
+
+    std::cout << "Creating PracticeGemmShape, PracticeGemmProblem, PracticeGemmPolicy" << std::endl;
+    using PracticeGemmShape = ck_tile::PracticeGemmShape<BlockTile, WaveTile>;
+    std::cout << "PracticeGemmShape: " << PracticeGemmShape::GetName() << std::endl;
+    using PracticeGemmHostProblem = ck_tile::
+        PracticeGemmHostProblem<ADataType, BDataType, CDataType, AccDataType, PracticeGemmShape>;
+    using PracticeGemmHostPolicy = ck_tile::PracticeGemmHostPolicy;
+
+    ck_tile::index_t kGridSize = ck_tile::integer_divide_ceil(M, PracticeGemmShape::BlockTile_M) *
+                                 ck_tile::integer_divide_ceil(N, PracticeGemmShape::BlockTile_N);
+
+    std::cout << "kGridSize: " << kGridSize << std::endl; // one block per C block tile
+    constexpr ck_tile::index_t kBlockPerCU = 1; // 1 block per CU
+
+    std::cout << "kBlockSize: " << kBlockSize << std::endl;
+    std::cout << "kBlockPerCU: " << kBlockPerCU << std::endl;
+
+    using gemm_kernel =
+        ck_tile::PracticeGemmKernel<PracticeGemmHostProblem, PracticeGemmHostPolicy>;
+
+    float ave_time = ck_tile::launch_kernel( // reported in ms by the perf print below
+        ck_tile::stream_config{nullptr, true, 0, 0, 1},
+        ck_tile::make_kernel<kBlockPerCU>(gemm_kernel{},
+                                          kGridSize,
+                                          kBlockSize,
+                                          0,
+                                          static_cast<ADataType*>(a_device.GetDeviceBuffer()),
+                                          static_cast<BDataType*>(b_device.GetDeviceBuffer()),
+                                          static_cast<CDataType*>(c_device.GetDeviceBuffer()),
+                                          M,
+                                          N,
+                                          K,
+                                          stride_a,
+                                          stride_b,
+                                          stride_c));
+
+    auto pass = true;
+
+    if(verification)
+    {
+        // reference gemm
+        ck_tile::HostTensor<CDataType> c_host_ref(c_lengths, c_strides);
+        reference_basic_gemm<ADataType, BDataType, AccDataType, CDataType>(
+            a_host, b_host, c_host_ref);
+        ck_tile::HostTensor<CDataType> c_host_dev(c_lengths, c_strides);
+        c_device.FromDevice(c_host_dev.mData.data()); // copy the device result back to the host
+        pass &= ck_tile::check_err(c_host_dev, c_host_ref, "Error: Incorrect results!", 1e-3, 1e-3);
+        std::cout << "valid:" << (pass ? "y" : "n") << std::endl;
+    }
+
+    std::size_t flop = std::size_t(2) * M * N * K; // one multiply and one add per (m, n, k)
+    std::size_t num_btype = // bytes of A plus B plus C
+        sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time; // flop / 1e9 / ms == TFlop/s
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time; // bytes / 1e6 / ms == GB/s
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+              << std::endl;
+
+    return !pass; // exit code 0 when verification passed (or was skipped)
+}
diff --git a/tutorial/ck_tile/01_naive_gemm/practice_gemm.hpp b/tutorial/ck_tile/01_naive_gemm/practice_gemm.hpp
new file mode 100644
index 0000000000..88879ee221
--- /dev/null
+++ b/tutorial/ck_tile/01_naive_gemm/practice_gemm.hpp
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <string>
+#include "ck_tile/core.hpp"
+#include "host_level/practice_gemm_host_policy_agmem_bgmem_creg.hpp"
+#include "host_level/practice_gemm_host_pipeline_agmem_bgmem_creg.hpp"
+
+namespace ck_tile {
+
+template <typename BlockTile_, typename WaveTile_>
+struct PracticeGemmShape // exposes the block-tile and wave-tile extents as named constants
+{
+    using BlockTile = remove_cvref_t<BlockTile_>;
+    using WaveTile  = remove_cvref_t<WaveTile_>;
+
+    static constexpr index_t BlockTile_M = BlockTile::at(number<0>{}); // element 0 of BlockTile
+    static constexpr index_t BlockTile_N = BlockTile::at(number<1>{}); // element 1 of BlockTile
+    static constexpr index_t BlockTile_K = BlockTile::at(number<2>{}); // element 2 of BlockTile
+
+    static constexpr index_t WaveTile_M = WaveTile::at(number<0>{}); // element 0 of WaveTile
+    static constexpr index_t WaveTile_N = WaveTile::at(number<1>{}); // element 1 of WaveTile
+    static constexpr index_t WaveTile_K = WaveTile::at(number<2>{}); // element 2 of WaveTile
+
+    CK_TILE_HOST static std::string GetName() // host-only: builds a name from the six extents
+    {
+        // clang-format off
+        return concat('_', "practice_gemm_shape",
+                      concat('x', BlockTile_M, BlockTile_N, BlockTile_K),
+                      concat('x', WaveTile_M, WaveTile_N, WaveTile_K));
+        // clang-format on
+    }
+};
+
+template <typename Problem_, typename Policy_>
+struct PracticeGemmKernel // kernel functor: wraps raw pointers in tensor views, runs the pipeline
+{
+    using Problem = remove_cvref_t<Problem_>;
+    using Policy  = remove_cvref_t<Policy_>;
+
+    static constexpr index_t kBlockSize = 256;
+
+    CK_TILE_DEVICE void operator()(const typename Problem::ADataType* p_a, // A, viewed as M x K
+                                   const typename Problem::BDataType* p_b, // B, viewed as N x K
+                                   typename Problem::CDataType* p_c,       // C, viewed as M x N
+                                   const index_t M,
+                                   const index_t N,
+                                   const index_t K,
+                                   const index_t stride_a, // stride of A's first dimension
+                                   const index_t stride_b, // stride of B's first dimension
+                                   const index_t stride_c) const // stride of C's first dimension
+    {
+
+        auto a_dram = make_naive_tensor_view<address_space_enum::global>( // global-memory view
+            p_a, make_tuple(M, K), make_tuple(stride_a, 1), number<8>{}, number<1>{});
+
+        auto b_dram = make_naive_tensor_view<address_space_enum::global>( // global-memory view
+            p_b, make_tuple(N, K), make_tuple(stride_b, 1), number<8>{}, number<1>{});
+
+        const auto c_dram = make_naive_tensor_view<address_space_enum::global>( // global view
+            p_c, make_tuple(M, N), make_tuple(stride_c, 1), number<8>{}, number<1>{});
+
+        PracticeGemmHostPipeline<Problem, Policy>{}(a_dram, b_dram, c_dram);
+    }
+};
+
+} // namespace ck_tile
diff --git a/tutorial/ck_tile/01_naive_gemm/reference_gemm.hpp b/tutorial/ck_tile/01_naive_gemm/reference_gemm.hpp
new file mode 100644
index 0000000000..8f975be7dc
--- /dev/null
+++ b/tutorial/ck_tile/01_naive_gemm/reference_gemm.hpp
@@ -0,0 +1,36 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host/host_tensor.hpp"
+
+template <typename ADataType, typename BDataType, typename AccDataType, typename CDataType>
+void reference_basic_gemm(const ck_tile::HostTensor<ADataType>& a_m_k, // indexed as (m, k)
+                          const ck_tile::HostTensor<BDataType>& b_n_k, // indexed as (n, k)
+                          ck_tile::HostTensor<CDataType>& c_m_n)       // output, written as (m, n)
+{
+    const int N = b_n_k.mDesc.get_lengths()[0]; // first length of B
+    const int K = b_n_k.mDesc.get_lengths()[1]; // second length of B
+
+    auto f = [&](auto m) { // fills row m of C
+        for(int n = 0; n < N; ++n)
+        {
+            AccDataType v_acc = 0; // sum over k of A(m, k) * B(n, k)
+
+            for(int k = 0; k < K; ++k)
+            {
+                ADataType v_a = a_m_k(m, k);
+                BDataType v_b = b_n_k(n, k);
+
+                v_acc += ck_tile::type_convert<AccDataType>(v_a) * // convert before multiplying
+                         ck_tile::type_convert<AccDataType>(v_b);
+            }
+
+            c_m_n(m, n) = ck_tile::type_convert<CDataType>(v_acc);
+        }
+    };
+
+    ck_tile::make_ParallelTensorFunctor(f, c_m_n.mDesc.get_lengths()[0])(1); // f over each m
+}
diff --git a/tutorial/ck_tile/01_naive_gemm/warp_level/practice_gemm_warp_pipeline_asmem_bsmem_creg.hpp b/tutorial/ck_tile/01_naive_gemm/warp_level/practice_gemm_warp_pipeline_asmem_bsmem_creg.hpp
new file mode 100644
index 0000000000..bf058af9c5
--- /dev/null
+++ b/tutorial/ck_tile/01_naive_gemm/warp_level/practice_gemm_warp_pipeline_asmem_bsmem_creg.hpp
@@ -0,0 +1,195 @@
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/host.hpp"
+
+namespace ck_tile {
+
+template <typename Problem, typename Policy = PracticeGemmWarpPolicy>
+struct PracticeGemmWarpPipelineASmemBSmemCreg // block tile of C accumulated via per-warp GEMMs
+{
+
+    using ADataType     = remove_cvref_t<typename Problem::ADataType>;
+    using BDataType     = remove_cvref_t<typename Problem::BDataType>;
+    using CDataType     = remove_cvref_t<typename Problem::CDataType>;
+    using WaveGemmShape = remove_cvref_t<typename Problem::Shape>;
+
+    using WarpGemm = remove_cvref_t< // element 0 of the policy tuple: the warp-GEMM type
+        decltype(Policy::template GetWarpGemmMWarpNWarp<Problem>().template get<0>())>;
+    static constexpr index_t MWarp = // element 1 of the policy tuple
+        Policy::template GetWarpGemmMWarpNWarp<Problem>().template get<1>();
+    static constexpr index_t NWarp = // element 2 of the policy tuple
+        Policy::template GetWarpGemmMWarpNWarp<Problem>().template get<2>();
+
+    using AWarpDstr = typename WarpGemm::AWarpDstr;
+    using BWarpDstr = typename WarpGemm::BWarpDstr;
+    using CWarpDstr = typename WarpGemm::CWarpDstr;
+
+    using AWarpTensor = typename WarpGemm::AWarpTensor;
+    using BWarpTensor = typename WarpGemm::BWarpTensor;
+    using CWarpTensor = typename WarpGemm::CWarpTensor;
+
+    static constexpr auto a_warp_y_lengths =
+        to_sequence(AWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
+    static constexpr auto b_warp_y_lengths =
+        to_sequence(BWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
+    static constexpr auto c_warp_y_lengths =
+        to_sequence(CWarpDstr{}.get_ys_to_d_descriptor().get_lengths());
+
+    static constexpr auto a_warp_y_index_zeros = uniform_sequence_gen_t<AWarpDstr::NDimY, 0>{};
+    static constexpr auto b_warp_y_index_zeros = uniform_sequence_gen_t<BWarpDstr::NDimY, 0>{};
+    static constexpr auto c_warp_y_index_zeros = uniform_sequence_gen_t<CWarpDstr::NDimY, 0>{};
+
+    // C += A * B
+    template <typename CBlockTensor, typename ABlockWindowTmp, typename BBlockWindowTmp>
+    CK_TILE_DEVICE void operator()(CBlockTensor& c_block_tensor,
+                                   [[maybe_unused]] const ABlockWindowTmp& a_block_window_tmp,
+                                   [[maybe_unused]] const BBlockWindowTmp& b_block_window_tmp) const
+    {
+        static_assert(std::is_same_v<ADataType, typename ABlockWindowTmp::DataType> &&
+                          std::is_same_v<BDataType, typename BBlockWindowTmp::DataType> &&
+                          std::is_same_v<CDataType, typename CBlockTensor::DataType>,
+                      "wrong!");
+
+        constexpr index_t MPerBlock = ABlockWindowTmp{}.get_window_lengths()[number<0>{}];
+        constexpr index_t NPerBlock = BBlockWindowTmp{}.get_window_lengths()[number<0>{}];
+        constexpr index_t KPerBlock = ABlockWindowTmp{}.get_window_lengths()[number<1>{}];
+
+        static_assert(MPerBlock == WaveGemmShape::BlockTile_M && // windows must match the shape
+                          NPerBlock == WaveGemmShape::BlockTile_N &&
+                          KPerBlock == WaveGemmShape::BlockTile_K,
+                      "wrong!");
+
+        constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WarpGemm::kM); // steps along M
+        constexpr index_t NIterPerWarp = NPerBlock / (NWarp * WarpGemm::kN); // steps along N
+        constexpr index_t KIterPerWarp = KPerBlock / WarpGemm::kK;           // steps along K
+
+#if !defined(ENABLE_PREFETCH) // NOTE(review): hot loop below needs the windows declared here
+        constexpr index_t MPerBlockPerIter = MPerBlock / MIterPerWarp; // == MWarp * WarpGemm::kM
+        constexpr index_t NPerBlockPerIter = NPerBlock / NIterPerWarp; // == NWarp * WarpGemm::kN
+        constexpr index_t KPerBlockPerIter = KPerBlock / KIterPerWarp; // == WarpGemm::kK
+
+        const index_t iMWarp = get_warp_id() / NWarp; // warp ids form an MWarp x NWarp grid
+        const index_t iNWarp = get_warp_id() % NWarp;
+
+        // Construct A-warp-window
+        auto a_warp_window_tmp = make_tile_window(
+            a_block_window_tmp.get_bottom_tensor_view(),
+            make_tuple(number<WarpGemm::kM>{}, number<WarpGemm::kK>{}),
+            {a_block_window_tmp.get_window_origin().at(number<0>{}) + iMWarp * WarpGemm::kM,
+             a_block_window_tmp.get_window_origin().at(number<1>{})},
+            make_static_tile_distribution(typename WarpGemm::AWarpDstrEncoding{}));
+
+        statically_indexed_array< // one A window per (mIter, kIter)
+            statically_indexed_array<decltype(a_warp_window_tmp), KIterPerWarp>,
+            MIterPerWarp>
+            a_warp_windows;
+
+        static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
+            static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
+                a_warp_windows(mIter)(kIter) = a_warp_window_tmp; // start from this warp's origin
+                move_tile_window(a_warp_windows(mIter)(kIter),
+                                 {mIter * MPerBlockPerIter, kIter * KPerBlockPerIter});
+            });
+        });
+
+        // Construct B-warp-window
+        auto b_warp_window_tmp = make_tile_window(
+            b_block_window_tmp.get_bottom_tensor_view(),
+            make_tuple(number<WarpGemm::kN>{}, number<WarpGemm::kK>{}),
+            {b_block_window_tmp.get_window_origin().at(number<0>{}) + iNWarp * WarpGemm::kN,
+             b_block_window_tmp.get_window_origin().at(number<1>{})},
+            make_static_tile_distribution(typename WarpGemm::BWarpDstrEncoding{}));
+
+        statically_indexed_array< // one B window per (nIter, kIter)
+            statically_indexed_array<decltype(b_warp_window_tmp), KIterPerWarp>,
+            NIterPerWarp>
+            b_warp_windows;
+
+        static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
+            static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
+                b_warp_windows(nIter)(kIter) = b_warp_window_tmp; // start from this warp's origin
+                move_tile_window(b_warp_windows(nIter)(kIter),
+                                 {nIter * NPerBlockPerIter, kIter * KPerBlockPerIter});
+            });
+        });
+#endif
+
+        // hot loop:
+        static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
+            static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
+                // Read A warp tensor from A block tensor
+                AWarpTensor a_warp_tensor;
+
+                a_warp_tensor = load_tile(a_warp_windows(mIter)(kIter)); // reused for every nIter
+
+                static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
+                    // Read B warp tensor from B block tensor
+                    BWarpTensor b_warp_tensor;
+
+                    b_warp_tensor = load_tile(b_warp_windows(nIter)(kIter));
+
+                    // Read C warp tensor from C block tensor
+                    CWarpTensor c_warp_tensor;
+
+                    c_warp_tensor.get_thread_buffer() = c_block_tensor.get_y_sliced_thread_data(
+                        merge_sequences(sequence<mIter, nIter>{}, c_warp_y_index_zeros),
+                        merge_sequences(sequence<1, 1>{}, c_warp_y_lengths));
+
+                    // Warp GEMM
+                    WarpGemm{}(c_warp_tensor, a_warp_tensor, b_warp_tensor);
+
+                    // Write C warp tensor into C block tensor
+                    c_block_tensor.set_y_sliced_thread_data( // same slice that was read above
+                        merge_sequences(sequence<mIter, nIter>{}, c_warp_y_index_zeros),
+                        merge_sequences(sequence<1, 1>{}, c_warp_y_lengths),
+                        c_warp_tensor.get_thread_buffer());
+                });
+            });
+        });
+    }
+
+    // C = A * B
+    template <typename ABlockWindowTmp, typename BBlockWindowTmp>
+    CK_TILE_DEVICE auto operator()([[maybe_unused]] const ABlockWindowTmp& a_block_window_tmp,
+                                   [[maybe_unused]] const BBlockWindowTmp& b_block_window_tmp) const
+    {
+        static_assert(std::is_same_v<ADataType, typename ABlockWindowTmp::DataType> &&
+                          std::is_same_v<BDataType, typename BBlockWindowTmp::DataType>,
+                      "wrong!");
+
+        constexpr index_t MPerBlock = ABlockWindowTmp{}.get_window_lengths()[number<0>{}];
+        constexpr index_t NPerBlock = BBlockWindowTmp{}.get_window_lengths()[number<0>{}];
+        constexpr index_t KPerBlock = ABlockWindowTmp{}.get_window_lengths()[number<1>{}];
+
+        static_assert(MPerBlock == WaveGemmShape::BlockTile_M && // windows must match the shape
+                          NPerBlock == WaveGemmShape::BlockTile_N &&
+                          KPerBlock == WaveGemmShape::BlockTile_K,
+                      "wrong!");
+
+        constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WarpGemm::kM); // steps along M
+        constexpr index_t NIterPerWarp = NPerBlock / (NWarp * WarpGemm::kN); // steps along N
+
+        static_assert(std::is_same_v<CDataType, typename WarpGemm::CDataType>, "wrong!");
+
+        // Construct C-Block-Tensor
+        constexpr auto c_block_outer_dstr_encoding = tile_distribution_encoding<
+            sequence<>,
+            tuple<sequence<MIterPerWarp, MWarp>, sequence<NIterPerWarp, NWarp>>,
+            tuple<sequence<1, 2>>,
+            tuple<sequence<1, 1>>,
+            sequence<1, 2>,
+            sequence<0, 0>>{};
+
+        constexpr auto c_block_dstr_encode = detail::make_embed_tile_distribution_encoding(
+            c_block_outer_dstr_encoding, typename WarpGemm::CWarpDstrEncoding{});
+
+        constexpr auto c_block_dstr = make_static_tile_distribution(c_block_dstr_encode);
+
+        auto c_block_tensor = make_static_distributed_tensor<CDataType>(c_block_dstr);
+
+        return c_block_tensor; // nothing in this overload writes to it before returning
+    }
+};
+
+} // namespace ck_tile
diff --git a/tutorial/ck_tile/01_naive_gemm/warp_level/practice_gemm_warp_policy_asmem_bsmem_creg.hpp b/tutorial/ck_tile/01_naive_gemm/warp_level/practice_gemm_warp_policy_asmem_bsmem_creg.hpp
new file mode 100644
index 0000000000..2efa2bcc2a
--- /dev/null
+++ b/tutorial/ck_tile/01_naive_gemm/warp_level/practice_gemm_warp_policy_asmem_bsmem_creg.hpp
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck_tile/core.hpp"
+#include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
+
+namespace ck_tile {
+
+// Default policy for PracticeGemmWarpPipelineASmemBSmemCreg
+// Default policy class should not be templated, put template on member functions instead
+struct PracticeGemmWarpPolicy // stateless: selects the warp GEMM and the warp grid
+{
+    template <typename Problem>
+    CK_TILE_HOST_DEVICE static constexpr auto GetWarpGemmMWarpNWarp() // (WarpGemm, MWarp, NWarp)
+    {
+        constexpr index_t kMWarp = 4; // warps along M
+        constexpr index_t kNWarp = 1; // warps along N
+        if constexpr(std::is_same_v<typename Problem::ADataType, half_t> &&
+                     std::is_same_v<typename Problem::BDataType, half_t> &&
+                     std::is_same_v<typename Problem::CDataType, float>)
+        {
+            return make_tuple(
+                WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution{}, kMWarp, kNWarp);
+        }
+        else
+        {
+            // Condition depends on Problem, so it only fires when this branch is instantiated
+            static_assert(sizeof(Problem) == 0,
+                          "Unsupported data type configuration for GEMM warp execution.");
+        }
+    }
+};
+
+} // namespace ck_tile
diff --git a/tutorial/ck_tile/CMakeLists.txt b/tutorial/ck_tile/CMakeLists.txt
new file mode 100644
index 0000000000..9895f5a71d
--- /dev/null
+++ b/tutorial/ck_tile/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories(AFTER # append to the include search path instead of prepending
+  ${CMAKE_CURRENT_LIST_DIR} # the directory that contains this CMakeLists.txt
+)
+
+add_subdirectory(00_copy_kernel) # each tutorial lives in its own subdirectory
+add_subdirectory(01_naive_gemm)
+