From 7ca2d03e826385d4d6930bcd23ca54ddfecacf94 Mon Sep 17 00:00:00 2001
From: coderfeli <coderfeli@163.com>
Date: Tue, 25 Feb 2025 05:08:21 +0000
Subject: [PATCH] merge moe sorting

---
 codegen/CMakeLists.txt                        |    1 -
 codegen/test/batched_gemm_softmax_gemm.cpp    |   87 ++
 codegen/test/gemm_multiple_d.cpp              |  134 +-
 codegen/test/include/common.hpp               |   49 +-
 codegen/test/rtc/CMakeLists.txt               |    6 +
 .../test/rtc/include/rtc/compile_kernel.hpp   |    5 +-
 codegen/test/rtc/src/compile_kernel.cpp       |  203 ++-
 docs/sphinx/requirements.in                   |    2 +-
 docs/sphinx/requirements.txt                  |    2 +-
 example/01_gemm/gemm_xdl_streamk.cpp          |    4 +
 .../20_grouped_conv_bwd_weight/CMakeLists.txt |    6 +
 .../grouped_conv_bwd_weight_v3_xdl_bf16.cpp   |  102 ++
 .../grouped_conv_bwd_weight_v3_xdl_fp16.cpp   |   99 ++
 ..._multiply_multiply_xdl_fp8_bpreshuffle.cpp |   65 +-
 .../67_gemm_microscaling/gemm_mx_common.hpp   |    2 +
 .../ck_tile/01_fmha/codegen/ops/fmha_bwd.py   |   26 +-
 .../ck_tile/01_fmha/codegen/ops/fmha_fwd.py   |   22 +-
 .../01_fmha/codegen/ops/fmha_fwd_appendkv.py  |    4 +-
 .../01_fmha/codegen/ops/fmha_fwd_splitkv.py   |   21 +-
 example/ck_tile/01_fmha/generate.py           |    8 +-
 example/ck_tile/03_gemm/gemm_basic.hpp        |   17 +-
 example/ck_tile/03_gemm/run_gemm_example.inc  |  119 +-
 example/ck_tile/03_gemm/universal_gemm.cpp    |   18 +
 .../ck_tile/13_moe_sorting/moe_sorting.cpp    |  112 +-
 .../13_moe_sorting/moe_sorting_api.cpp        |  104 +-
 .../13_moe_sorting/moe_sorting_api.hpp        |    6 +
 include/ck/ck.hpp                             |    3 +-
 include/ck/host_utility/device_prop.hpp       |    2 +
 include/ck/host_utility/kernel_launch.hpp     |    3 +-
 ..._pipeline_xdlops_b_preshuffle_selector.hpp |    2 +-
 ...e_gemm_pipeline_xdlops_b_preshuffle_v1.hpp |    5 +-
 .../gpu/device/device_base.hpp                |    7 +-
 .../device_batched_gemm_softmax_gemm.hpp      |    5 +-
 .../gpu/device/device_gemm_multiple_d.hpp     |    9 +-
 .../gpu/device/gemm_specialization.hpp        |    3 +-
 ...batched_gemm_softmax_gemm_xdl_cshuffle.hpp |   10 +-
 .../device_gemm_multiple_d_xdl_cshuffle.hpp   |   27 +-
 ...e_grouped_conv_bwd_weight_xdl_cshuffle.hpp |  299 +++-
 ...rouped_conv_bwd_weight_xdl_cshuffle_v3.hpp | 1349 +++++++++++++++++
 .../gpu/device/masking_specialization.hpp     |    4 +-
 .../gpu/device/tensor_layout.hpp              |    2 +-
 .../element/unary_element_wise_operation.hpp  |    3 +-
 .../gpu/grid/block_to_ctile_map.hpp           |    2 +-
 .../gridwise_gemm_multiple_d_xdl_cshuffle.hpp |    5 +-
 .../grid/gridwise_gemm_pipeline_selector.hpp  |    7 +-
 .../gridwise_gemm_xdl_cshuffle_streamk_v3.hpp |   68 +-
 .../grid/gridwise_gemm_xdl_cshuffle_v3.hpp    |    0
 .../tensor_operation/gpu/warp/xdlops_gemm.hpp |   21 +-
 .../transform_conv_bwd_weight_to_gemm_v2.hpp  |  271 +++-
 include/ck/utility/amd_buffer_addressing.hpp  |    2 +
 .../ck/utility/amd_wave_read_first_lane.hpp   |    2 +-
 include/ck/utility/amd_xdlops.hpp             |   10 +-
 include/ck/utility/data_type.hpp              |  128 +-
 include/ck/utility/enable_if.hpp              |   18 +-
 include/ck/utility/loop_scheduler.hpp         |    8 +-
 include/ck/utility/magic_division.hpp         |    3 +-
 include/ck/utility/math_v2.hpp                |   20 +-
 include/ck/utility/sequence.hpp               |    4 +-
 include/ck/utility/tuple_helper.hpp           |    4 +-
 include/ck/utility/type.hpp                   |  629 ++++----
 include/ck/utility/type_convert.hpp           |    4 +-
 .../core/arch/amd_buffer_addressing.hpp       |    6 +-
 include/ck_tile/core/container/array.hpp      |    6 +-
 .../ck_tile/core/container/thread_buffer.hpp  |   13 +-
 include/ck_tile/core/container/tuple.hpp      |    4 +-
 include/ck_tile/core/numeric/bfloat16.hpp     |    8 +-
 include/ck_tile/core/numeric/float8.hpp       |    5 +-
 include/ck_tile/core/numeric/half.hpp         |   24 +-
 include/ck_tile/core/numeric/int8.hpp         |    5 +-
 include/ck_tile/core/numeric/numeric.hpp      |    6 +-
 include/ck_tile/core/numeric/pk_int4.hpp      |   14 +-
 include/ck_tile/core/numeric/vector_type.hpp  |   66 +-
 include/ck_tile/core/tensor/buffer_view.hpp   |  119 +-
 .../core/tensor/static_distributed_tensor.hpp |   22 +-
 include/ck_tile/core/tensor/tensor_view.hpp   |   65 +-
 include/ck_tile/core/tensor/tile_window.hpp   |   47 +-
 .../core/tensor/tile_window_linear.hpp        |   57 +-
 include/ck_tile/host/check_err.hpp            |   30 +-
 include/ck_tile/host/fill.hpp                 |   11 +-
 include/ck_tile/host/host_tensor.hpp          |   28 +-
 .../ck_tile/host/reference/reference_gemm.hpp |   68 +-
 .../unary_element_wise_operation.hpp          |  156 +-
 .../ck_tile/ops/fmha/block/block_masking.hpp  |    2 +-
 .../fmha/kernel/fmha_fwd_splitkv_kernel.hpp   |    2 +-
 .../fused_moe/kernel/moe_sorting_kernel.hpp   |  824 +++++++++-
 .../fused_moe/kernel/moe_sorting_problem.hpp  |   17 +
 .../block/block_universal_gemm_as_bs_cr.hpp   |  118 +-
 .../gemm_pipeline_ag_bg_cr_comp_v3.hpp        |   21 +-
 .../gemm_pipeline_ag_bg_cr_comp_v4.hpp        |   19 +-
 .../pipeline/gemm_pipeline_ag_bg_cr_mem.hpp   |   15 +-
 ...ine_agmem_bgmem_creg_v1_default_policy.hpp |   18 +-
 .../gemm_pipeline_agmem_bgmem_creg_v2.hpp     |   20 +-
 .../gemm/pipeline/gemm_pipeline_problem.hpp   |   45 +-
 ...emm_universal_pipeline_ag_bg_cr_policy.hpp |   40 +-
 .../cpu/reference_fpAintB_gemm.hpp            |   38 +-
 .../cpu/reference_gemm.hpp                    |   29 +-
 .../cpu/reference_gemm_multiple_d.hpp         |   24 +-
 .../device_operation_instance_factory.hpp     |    1 +
 .../tensor_operation_instance/gpu/gemm.hpp    |   44 +-
 .../tensor_operation_instance/gpu/gemm_dl.inc |   30 -
 .../gpu/gemm_multiply_multiply.hpp            |   81 +-
 ...mm_multiply_multiply_weight_preshuffle.hpp |  133 +-
 .../gpu/gemm_universal.hpp                    |   33 +
 .../gpu/gemm_universal_streamk.hpp            |  500 ++++++
 ...conv_bwd_weight_two_stage_xdl_instance.hpp |   78 +-
 ...rouped_conv_bwd_weight_v3_xdl_instance.hpp |  112 ++
 ...e_grouped_conv_bwd_weight_xdl_instance.hpp |  264 ++--
 ...ice_grouped_conv_fwd_xdl_comp_instance.hpp |   19 +-
 ...ed_conv_fwd_xdl_merged_groups_instance.hpp |   17 +-
 .../grouped_convolution_backward_weight.hpp   |  112 +-
 ...rouped_convolution_backward_weight_xdl.inc |  505 +++++-
 .../gpu/grouped_convolution_forward.hpp       |   18 +-
 .../grouped_convolution_forward_comp_xdl.inc  |   16 +
 ...uped_convolution_forward_mem_inter_xdl.inc |   16 +
 ...uped_convolution_forward_mem_intra_xdl.inc |   16 +
 .../gpu/grouped_convolution_forward_xdl.inc   |   16 +
 ..._convolution_forward_xdl_merged_groups.inc |   14 +
 .../gpu/grouped_gemm_fixed_nk.hpp             |   47 +
 .../gpu/CMakeLists.txt                        |   71 +-
 ...m_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp |    5 +-
 ...m_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp |    5 +-
 ...m_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp |   10 +-
 ...m_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp |   10 +-
 ...6_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp |    8 +-
 ...f16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp |    4 +-
 ...6_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp |    4 +-
 ...f16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp |    4 +-
 ...6_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp |    2 +
 ..._c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp |   20 +-
 ..._bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp |   20 +-
 ...s_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp |   20 +-
 ..._2_stage_f16_f16_f16_mk_nk_mn_instance.cpp |    5 +-
 ...uffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp |    5 +
 ..._shuffle_f16_f16_f16_km_kn_mn_instance.cpp |    5 +-
 ..._shuffle_f16_f16_f16_km_nk_mn_instance.cpp |    5 +-
 ..._shuffle_f16_f16_f16_mk_kn_mn_instance.cpp |    9 +-
 ..._shuffle_f16_f16_f16_mk_nk_mn_instance.cpp |   12 +-
 ...l_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp |    7 +-
 ...m_kn_mn_interwave_pipeline_v1_instance.cpp |    9 +-
 ...regular_interwave_pipeline_v1_instance.cpp |    2 +
 ...m_nk_mn_interwave_pipeline_v1_instance.cpp |    9 +-
 ...regular_interwave_pipeline_v1_instance.cpp |    2 +
 ...k_kn_mn_interwave_pipeline_v1_instance.cpp |    9 +-
 ...regular_interwave_pipeline_v1_instance.cpp |    2 +
 ...k_nk_mn_interwave_pipeline_v1_instance.cpp |    2 +
 ...regular_interwave_pipeline_v1_instance.cpp |    2 +
 ...16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp |   15 +-
 ...16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp |   15 +-
 ...16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp |   15 +-
 ...16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp |   15 +-
 ...e_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp |   15 +-
 ...e_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp |   15 +-
 ...e_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp |   15 +-
 ...e_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp |   15 +-
 ..._layernorm_f16_km_kn_mn_mn_mn_instance.cpp |    8 +-
 ..._layernorm_f16_km_nk_mn_mn_mn_instance.cpp |    8 +-
 ..._layernorm_f16_mk_kn_mn_mn_mn_instance.cpp |    8 +-
 ..._layernorm_f16_mk_nk_mn_mn_mn_instance.cpp |    8 +-
 ..._shuffle_f16_f16_f16_km_kn_mn_instance.cpp |   15 +-
 ..._shuffle_f16_f16_f16_km_nk_mn_instance.cpp |   15 +-
 ..._shuffle_f16_f16_f16_mk_kn_mn_instance.cpp |   15 +-
 ..._shuffle_f16_f16_f16_mk_nk_mn_instance.cpp |   15 +-
 ...tiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp |    8 +
 .../CMakeLists.txt                            |   32 +-
 ..._mfma16x16_mn_compute_default_instance.cpp |   33 +
 ...t_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn.hpp |  147 +-
 ...16_mk_mfma_mn_compute_default_instance.cpp |    4 +-
 ...f8_bf16_mk_mfma_mn_p1_default_instance.cpp |    1 +
 ...f8_bf16_mk_mfma_mn_p2_default_instance.cpp |    1 +
 ...f8_bf16_mk_mfma_mn_p3_default_instance.cpp |    1 +
 ..._mfma16x16_mn_compute_default_instance.cpp |   33 +
 ...ht_preshuffle_xdl_f8_f8_f16_mk_mfma_mn.hpp |  147 +-
 ..._f8_f16_mk_mfma_mn_p1_default_instance.cpp |    3 +-
 ..._f8_f16_mk_mfma_mn_p2_default_instance.cpp |    1 +
 ..._f8_f16_mk_mfma_mn_p3_default_instance.cpp |    1 +
 ...f16_f16_mk_kn_mn_v1_interwave_instance.cpp |    4 +-
 ...f16_f16_mk_kn_mn_v1_irregular_instance.cpp |    4 +-
 ...f16_f16_mk_nk_mn_v1_interwave_instance.cpp |    4 +-
 .../gpu/gemm_universal/CMakeLists.txt         |    3 +
 ..._xdl_universal_bf16_bf16_bf16_km_kn_mn.hpp |   10 +-
 ..._xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp |   16 +-
 ..._xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp |   30 +-
 ..._xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp |   32 +-
 ...emm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp |   41 +-
 ...emm_xdl_universal_f16_f16_f16_mk_nk_mn.hpp |   44 +-
 ...gemm_xdl_universal_f16_f8_f16_mk_kn_mn.hpp |   12 +-
 ...gemm_xdl_universal_f16_f8_f16_mk_nk_mn.hpp |   12 +-
 ...gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp |    2 +
 ...gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp |    2 +
 ..._xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp |    2 +
 ..._xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp |   12 +-
 ...mm_xdl_universal_bf16_i8_bf16_mk_kn_mn.hpp |   14 +-
 ...emm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp |   12 +-
 .../gpu/gemm_universal_streamk/CMakeLists.txt |   39 +-
 ...versal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp |    2 +
 ...versal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp |    9 +
 ...universal_streamk_f16_f16_f16_mk_kn_mn.hpp |   16 +-
 ...universal_streamk_f16_f16_f16_mk_nk_mn.hpp |   14 +-
 .../grouped_conv2d_bwd_weight/CMakeLists.txt  |   23 +
 ...hwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp |    6 +-
 ...kyxc_gnhwk_f16_default_pipev1_instance.cpp |   38 +
 ...ght_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp |   27 +-
 ...c_gkyxc_gnhwk_f16_pad0_pipev1_instance.cpp |   39 +
 ...kyxc_gnhwk_f32_default_pipev1_instance.cpp |   39 +
 ...ght_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp |   27 +-
 ...c_gkyxc_gnhwk_f32_pad0_pipev1_instance.cpp |   39 +
 ...ht_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp |   49 +
 ...ght_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp |   49 +
 ...ght_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp |   49 +
 ...yxc_nhwgk_bf16_default_pipev2_instance.cpp |   39 +
 ...yxc_nhwgk_bf16_default_pipev5_instance.cpp |   39 +
 ..._gkyxc_nhwgk_bf16_pad0_pipev2_instance.cpp |   40 +
 ..._gkyxc_nhwgk_bf16_pad0_pipev5_instance.cpp |   40 +
 ...kyxc_nhwgk_f16_default_pipev2_instance.cpp |   40 +
 ...kyxc_nhwgk_f16_default_pipev5_instance.cpp |   40 +
 ...c_gkyxc_nhwgk_f16_pad0_pipev2_instance.cpp |   39 +
 ...c_gkyxc_nhwgk_f16_pad0_pipev5_instance.cpp |   39 +
 ...kyxc_nhwgk_f32_default_pipev2_instance.cpp |   39 +
 ...kyxc_nhwgk_f32_default_pipev5_instance.cpp |   39 +
 ...c_gkyxc_nhwgk_f32_pad0_pipev2_instance.cpp |   39 +
 ...c_gkyxc_nhwgk_f32_pad0_pipev5_instance.cpp |   39 +
 .../gpu/grouped_conv2d_fwd/CMakeLists.txt     |    5 +
 .../grouped_conv3d_bwd_weight/CMakeLists.txt  |   43 +-
 ...c_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp |    6 +-
 ..._xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp |   27 +-
 ..._xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp |   27 +-
 ...xc_ndhwgk_bf16_default_pipev2_instance.cpp |   39 +
 ...xc_ndhwgk_bf16_default_pipev5_instance.cpp |   39 +
 ...kzyxc_ndhwgk_bf16_pad0_pipev2_instance.cpp |   39 +
 ...kzyxc_ndhwgk_bf16_pad0_pipev5_instance.cpp |   39 +
 ...yxc_ndhwgk_f16_default_pipev2_instance.cpp |   39 +
 ...yxc_ndhwgk_f16_default_pipev5_instance.cpp |   39 +
 ...gkzyxc_ndhwgk_f16_pad0_pipev2_instance.cpp |   39 +
 ...gkzyxc_ndhwgk_f16_pad0_pipev5_instance.cpp |   39 +
 ...yxc_ndhwgk_f32_default_pipev2_instance.cpp |   39 +
 ...yxc_ndhwgk_f32_default_pipev5_instance.cpp |   39 +
 ...gkzyxc_ndhwgk_f32_pad0_pipev2_instance.cpp |   39 +
 ...gkzyxc_ndhwgk_f32_pad0_pipev5_instance.cpp |   39 +
 ...xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instance.cpp |   49 +
 ..._xdl_ngcdhw_gkzyxc_ngkdhw_f16_instance.cpp |   49 +
 ..._xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp |   49 +
 ...f16_f8_f16_mk_kn_mn_irregular_instance.cpp |    5 +-
 .../gpu/grouped_gemm_fixed_nk/CMakeLists.txt  |    6 +-
 ...le_loop_multiply_bf16_i8_bf16_mk_kn_mn.hpp |    9 +-
 library/src/utility/CMakeLists.txt            |    6 +-
 .../profile_gemm_multiply_multiply_impl.hpp   |    1 -
 .../profiler/profile_gemm_universal_impl.hpp  |  105 +-
 .../profile_grouped_conv_bwd_weight_impl.hpp  |   39 +-
 profiler/src/CMakeLists.txt                   |  308 ++--
 .../src/profile_gemm_multiply_multiply.cpp    |   12 +-
 profiler/src/profile_gemm_universal.cpp       |   17 +-
 .../src/profile_gemm_universal_streamk.cpp    |   45 +-
 .../src/profile_grouped_gemm_fixed_nk.cpp     |  225 +--
 test/grouped_convnd_bwd_weight/CMakeLists.txt |    4 +
 ...ped_convnd_bwd_weight_v3_interface_xdl.cpp |  179 +++
 test/mx_mfma_op/mx_mfma_op.cpp                |   69 +-
 test/mx_mfma_op/mx_mfma_op.hpp                |  924 +++++++++--
 257 files changed, 11031 insertions(+), 1958 deletions(-)
 create mode 100644 codegen/test/batched_gemm_softmax_gemm.cpp
 create mode 100644 example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_v3_xdl_bf16.cpp
 create mode 100644 example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_v3_xdl_fp16.cpp
 create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp
 mode change 100644 => 100755 include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
 create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp
 mode change 100644 => 100755 library/src/tensor_operation_instance/gpu/CMakeLists.txt
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_default_pipev1_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_pad0_pipev1_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_default_pipev1_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_pad0_pipev1_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev2_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev5_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev2_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev5_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev2_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev5_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev2_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev5_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev2_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev5_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev2_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev5_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev2_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev5_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev2_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev5_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev2_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev5_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev2_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev5_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev2_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev5_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev2_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev5_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instance.cpp
 create mode 100644 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp
 mode change 100755 => 100644 profiler/src/profile_gemm_universal_streamk.cpp
 create mode 100644 test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_v3_interface_xdl.cpp

diff --git a/codegen/CMakeLists.txt b/codegen/CMakeLists.txt
index 45c47672b0..9e7c360f54 100644
--- a/codegen/CMakeLists.txt
+++ b/codegen/CMakeLists.txt
@@ -46,7 +46,6 @@ rocm_install_targets(
     TARGETS ck_host ck_headers
     EXPORT ck_host_targets
     INCLUDE include
-    PRIVATE
 )
 rocm_export_targets(
     EXPORT ck_host_targets
diff --git a/codegen/test/batched_gemm_softmax_gemm.cpp b/codegen/test/batched_gemm_softmax_gemm.cpp
new file mode 100644
index 0000000000..3f0b8bfe6a
--- /dev/null
+++ b/codegen/test/batched_gemm_softmax_gemm.cpp
@@ -0,0 +1,87 @@
+#include "ck/host/device_batched_gemm_softmax_gemm/problem.hpp"
+#include "ck/host/stringutils.hpp"
+#include "ck/host/utils.hpp"
+#include "common.hpp"
+#include <rtc/compile_kernel.hpp>
+#include <rtc/hip.hpp>
+#include <test.hpp>
+#include <cmath>
+
+using half = _Float16;
+
+const std::string gemm_compile_check = R"__ck__(
+#include <${include}>
+
+extern "C" __global__ void f(const ck::half_t* a, const ck::half_t* b, const ck::half_t* b1, ck::half_t* c) {
+    using G = ${template};
+    constexpr auto desc = G::make_descriptor(ck::make_naive_tensor_descriptor(ck::make_tuple(${m}, ${k}), ck::make_tuple(${m}, 1)),
+                                             ck::make_naive_tensor_descriptor(ck::make_tuple(${n}, ${k}), ck::make_tuple(${n}, 1)),
+                                             ck::make_naive_tensor_descriptor(ck::make_tuple(${n}, ${o}), ck::make_tuple(1, ${n})),
+                                             ck::make_naive_tensor_descriptor(ck::make_tuple(${m}, ${o}), ck::make_tuple(${m}, 1)));
+
+    static_assert(desc.IsValid(), "Invalid ck gemm.");
+
+    if constexpr(desc.IsValid())
+    {
+        ${template}::Run(desc,
+               1.0,
+               a,
+               b,
+               b1,
+               c);
+    }
+}
+
+)__ck__";
+
+TEST_CASE(test_problem_kernel)
+{
+    ck::host::device_batched_gemm_softmax_gemm::Problem prob;
+    prob.M      = 1024;
+    prob.N      = 1024;
+    prob.K      = 1024;
+    prob.O      = 1024;
+    prob.TransB = true;
+    check_all<half> check1, check2;
+    auto a  = to_gpu(generate_buffer<half>(1024 * 1024, 0));
+    auto b  = to_gpu(generate_buffer<half>(1024 * 1024, 1));
+    auto b1 = to_gpu(generate_buffer<half>(1024 * 1024, 2));
+    auto c  = to_gpu(generate_buffer<half>(1024 * 1024, 3));
+
+    std::string epilogue = "";
+    std::string prologue = "";
+
+    auto solutions = prob.GetSolutions("gfx90a", prologue, epilogue);
+    std::cout << "Num solutions: " << solutions.size() << std::endl;
+    for(auto i = 0; i < solutions.size(); ++i)
+    {
+        std::cout << "Testing solution " << std::to_string(i + 1) << std::endl;
+        auto&& solution = solutions[i];
+        auto src        = ck::host::InterpolateString(gemm_compile_check,
+                                               {{"include", prob.GetIncludeHeader()},
+                                                {"template", solution.ToTemplateString()},
+                                                {"m", std::to_string(prob.M)},
+                                                {"n", std::to_string(prob.N)},
+                                                {"k", std::to_string(prob.K)},
+                                                {"o", std::to_string(prob.O)}});
+        auto srcs       = get_headers_for_test();
+        srcs.push_back({"main.cpp", src});
+        rtc::compile_options options;
+        options.kernel_name = "f";
+        auto k              = rtc::compile_kernel(srcs, options);
+        auto block_size     = solution.GetTemplateParameter<std::size_t>("BlockSize");
+        auto m_per_block    = solution.GetTemplateParameter<std::size_t>("Gemm01MPerBlock");
+        auto n_per_block    = solution.GetTemplateParameter<std::size_t>("Gemm1NPerBlock");
+        auto grid_size      = ck::host::integer_divide_ceil(prob.M, m_per_block) *
+                         ck::host::integer_divide_ceil(prob.N, n_per_block);
+        k.launch(nullptr, grid_size * block_size, block_size)(
+            a.data(), b.data(), b1.data(), c.data());
+
+        if(solution.GetTemplateParameter<bool>("MaskOutUpperTriangle"))
+            CHECK(report(solution, check1(rtc::from_gpu(c))));
+        else
+            CHECK(report(solution, check2(rtc::from_gpu(c))));
+    }
+}
+
+int main(int argc, const char* argv[]) { test::run(argc, argv); }
diff --git a/codegen/test/gemm_multiple_d.cpp b/codegen/test/gemm_multiple_d.cpp
index 9e2d990d9b..2a383fc1c8 100644
--- a/codegen/test/gemm_multiple_d.cpp
+++ b/codegen/test/gemm_multiple_d.cpp
@@ -6,134 +6,24 @@
 #include "ck/host/headers.hpp"
 #include "ck/host/stringutils.hpp"
 #include "ck/host/utils.hpp"
-#include <algorithm>
-#include <cmath>
-#include <iterator>
-#include <random>
-#include <test.hpp>
+#include "common.hpp"
 #include <rtc/compile_kernel.hpp>
 #include <rtc/hip.hpp>
+#include <test.hpp>
+#include <algorithm>
+#include <cmath>
 #include <fstream>
+#include <iterator>
+#include <random>
 
 using half = _Float16;
-// using half = __fp16;
-
-std::vector<rtc::src_file> get_headers_for_test()
-{
-    std::vector<rtc::src_file> result;
-    auto hs = ck::host::GetHeaders();
-    std::transform(
-        hs.begin(), hs.end(), std::back_inserter(result), [&](const auto& p) -> rtc::src_file {
-            return {p.first, p.second};
-        });
-    return result;
-}
-
-template <class T>
-rtc::buffer<T> generate_buffer(std::size_t n, std::size_t seed = 0)
-{
-    rtc::buffer<T> result(n);
-    std::mt19937 gen(seed);
-    std::uniform_real_distribution<double> dis(-1.0);
-    std::generate(result.begin(), result.end(), [&] { return dis(gen); });
-    return result;
-}
-
-template <class T, class U>
-bool allclose(const T& a, const U& b, double atol = 0.01, double rtol = 0.01)
-{
-    return std::equal(a.begin(), a.end(), b.begin(), b.end(), [&](double x, double y) {
-        return fabs(x - y) < atol + rtol * fabs(y);
-    });
-}
-
-std::string classify(double x)
-{
-    switch(std::fpclassify(x))
-    {
-    case FP_INFINITE: return "inf";
-    case FP_NAN: return "nan";
-    case FP_NORMAL: return "normal";
-    case FP_SUBNORMAL: return "subnormal";
-    case FP_ZERO: return "zero";
-    default: return "unknown";
-    }
-}
-
-template <class Buffer>
-void print_classification(const Buffer& x)
-{
-    std::unordered_set<std::string> result;
-    for(const auto& i : x)
-        result.insert(classify(i));
-    for(const auto& c : result)
-        std::cout << c << ", ";
-    std::cout << std::endl;
-}
-
-template <class Buffer>
-void print_statistics(const Buffer& x)
-{
-    std::cout << "Min value: " << *std::min_element(x.begin(), x.end()) << ", ";
-    std::cout << "Max value: " << *std::max_element(x.begin(), x.end()) << ", ";
-    double num_elements = x.size();
-    auto mean =
-        std::accumulate(x.begin(), x.end(), double{0.0}, std::plus<double>{}) / num_elements;
-    auto stddev = std::sqrt(
-        std::accumulate(x.begin(),
-                        x.end(),
-                        double{0.0},
-                        [&](double r, double v) { return r + std::pow((v - mean), 2.0); }) /
-        num_elements);
-    std::cout << "Mean: " << mean << ", ";
-    std::cout << "StdDev: " << stddev << "\n";
-}
-
-template <class Buffer>
-void print_preview(const Buffer& x)
-{
-    if(x.size() <= 10)
-    {
-        std::for_each(x.begin(), x.end(), [&](double i) { std::cout << i << ", "; });
-    }
-    else
-    {
-        std::for_each(x.begin(), x.begin() + 5, [&](double i) { std::cout << i << ", "; });
-        std::cout << "..., ";
-        std::for_each(x.end() - 5, x.end(), [&](double i) { std::cout << i << ", "; });
-    }
-    std::cout << std::endl;
-}
-
-template <class T>
-struct check_all
-{
-    rtc::buffer<T> data{};
-    bool operator()(const rtc::buffer<T>& x)
-    {
-        if(data.empty())
-        {
-            data = x;
-            return true;
-        }
-        if(std::any_of(x.begin(), x.end(), [](double y) { return std::isnan(y); }))
-            return false;
-        return allclose(data, x);
-    }
-};
-
-template <class Solution>
-auto report(const Solution& solution, bool pass)
-{
-    return test::make_predicate(solution.ToTemplateString(), [=] { return pass; });
-}
 
 const std::string gemm_compile_check = R"__ck__(
 #include <${include}>
 
 extern "C" __global__ void f(const ck::half_t* a, const ck::half_t* b, ck::half_t* c) {
     using G = ${template};
-    constexpr auto desc = ${template}::make_descriptor(ck::make_naive_tensor_descriptor_packed(ck::make_tuple(${m}, ${k})),
+    constexpr auto desc = G::make_descriptor(ck::make_naive_tensor_descriptor_packed(ck::make_tuple(${m}, ${k})),
                                              ck::make_naive_tensor_descriptor(ck::make_tuple(${n}, ${k}), ck::make_tuple(1, ${n})),
                                              ck::make_tuple(),
                                              ck::make_naive_tensor_descriptor_packed(ck::make_tuple(${m}, ${n})));
@@ -166,15 +56,19 @@ TEST_CASE(test_problem_kernel)
     std::string epilogue = "";
     std::string prologue = "";
 
-    for(auto solution : prob.GetSolutions("gfx90a", prologue, epilogue))
+    auto solutions = prob.GetSolutions("gfx90a", prologue, epilogue);
+    std::cout << "Num solutions: " << solutions.size() << std::endl;
+    for(auto i = 0; i < solutions.size(); ++i)
     {
-        auto src  = ck::host::InterpolateString(gemm_compile_check,
+        std::cout << "Testing solution " << std::to_string(i + 1) << std::endl;
+        auto&& solution = solutions[i];
+        auto src        = ck::host::InterpolateString(gemm_compile_check,
                                                {{"include", prob.GetIncludeHeader()},
                                                 {"template", solution.ToTemplateString()},
                                                 {"m", std::to_string(prob.M)},
                                                 {"n", std::to_string(prob.N)},
                                                 {"k", std::to_string(prob.K)}});
-        auto srcs = get_headers_for_test();
+        auto srcs       = get_headers_for_test();
         srcs.push_back({"main.cpp", src});
         rtc::compile_options options;
         options.kernel_name = "f";
diff --git a/codegen/test/include/common.hpp b/codegen/test/include/common.hpp
index 24fde2e523..b3be592e74 100644
--- a/codegen/test/include/common.hpp
+++ b/codegen/test/include/common.hpp
@@ -2,27 +2,38 @@
 // Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
+
+#include "ck/host/headers.hpp"
+#include <rtc/compile_kernel.hpp>
+#include <rtc/hip.hpp>
+#include <test.hpp>
 #include <algorithm>
 #include <cmath>
 #include <iterator>
 #include <numeric>
 #include <random>
-#include <test.hpp>
-#include <rtc/compile_kernel.hpp>
-#include <rtc/hip.hpp>
-#include <fstream>
+#include <unordered_set>
 
-std::vector<rtc::src_file> get_headers_for_test()
+inline std::vector<rtc::src_file> create_headers_for_test()
 {
+    auto ck_headers = ck::host::GetHeaders();
     std::vector<rtc::src_file> result;
-    auto hs = ck::host::GetHeaders();
-    std::transform(
-        hs.begin(), hs.end(), std::back_inserter(result), [&](const auto& p) -> rtc::src_file {
-            return {p.first, p.second};
-        });
+    std::transform(ck_headers.begin(), ck_headers.end(), std::back_inserter(result), [](auto& p) {
+        std::string content;
+        content.reserve(p.second.size() + 1);
+        content.push_back(' '); // We need a whitespace before the content for hipRTC to work
+        content.append(p.second.data(), p.second.size());
+        return rtc::src_file{p.first, std::move(content)};
+    });
     return result;
 }
 
+inline const std::vector<rtc::src_file>& get_headers_for_test()
+{
+    static const std::vector<rtc::src_file> headers = create_headers_for_test();
+    return headers;
+}
+
 template <typename V>
 std::size_t GetSize(V mLens, V mStrides)
 {
@@ -37,18 +48,24 @@ std::size_t GetSize(V mLens, V mStrides)
     return space;
 }
 
-template <class T, typename V>
-rtc::buffer<T> generate_buffer(V mLens, V mStrides, std::size_t seed = 0)
+template <class T>
+rtc::buffer<T> generate_buffer(std::size_t n, std::size_t seed = 0)
 {
-    std::size_t space = GetSize(mLens, mStrides);
-    rtc::buffer<T> result(space);
+    rtc::buffer<T> result(n);
     std::mt19937 gen(seed);
     std::uniform_real_distribution<double> dis(-1.0);
     std::generate(result.begin(), result.end(), [&] { return dis(gen); });
-    // std::fill(result.begin(), result.end(), 1);
     return result;
 }
 
+template <class T, typename V>
+std::enable_if_t<!std::is_integral_v<V>, rtc::buffer<T>>
+generate_buffer(V mLens, V mStrides, std::size_t seed = 0)
+{
+    std::size_t space = GetSize(mLens, mStrides);
+    return generate_buffer<T>(space, seed);
+}
+
 template <class T, class U>
 bool allclose(const T& a, const U& b, double atol = 0.01, double rtol = 0.01)
 {
@@ -57,7 +74,7 @@ bool allclose(const T& a, const U& b, double atol = 0.01, double rtol = 0.01)
     });
 }
 
-std::string classify(double x)
+inline std::string classify(double x)
 {
     switch(std::fpclassify(x))
     {
diff --git a/codegen/test/rtc/CMakeLists.txt b/codegen/test/rtc/CMakeLists.txt
index 68bfc2467b..2e7ceb5648 100644
--- a/codegen/test/rtc/CMakeLists.txt
+++ b/codegen/test/rtc/CMakeLists.txt
@@ -4,3 +4,9 @@ add_library(ck_rtc ${RTC_SOURCES})
 target_include_directories(ck_rtc PUBLIC include)
 target_link_libraries(ck_rtc PUBLIC hip::host)
 target_link_libraries(ck_rtc PUBLIC -lstdc++fs)
+
+option(USE_HIPRTC_FOR_CODEGEN_TESTS "Whether to enable hipRTC for codegen tests." ON)
+if(USE_HIPRTC_FOR_CODEGEN_TESTS)
+    target_compile_definitions(ck_rtc PUBLIC HIPRTC_FOR_CODEGEN_TESTS)
+    message("CK compiled with USE_HIPRTC_FOR_CODEGEN_TESTS set to ${USE_HIPRTC_FOR_CODEGEN_TESTS}")
+endif()
diff --git a/codegen/test/rtc/include/rtc/compile_kernel.hpp b/codegen/test/rtc/include/rtc/compile_kernel.hpp
index a49714f7c6..207f10a8e8 100644
--- a/codegen/test/rtc/include/rtc/compile_kernel.hpp
+++ b/codegen/test/rtc/include/rtc/compile_kernel.hpp
@@ -12,8 +12,9 @@ namespace rtc {
 
 struct src_file
 {
+    src_file(std::filesystem::path p, std::string c) : path{std::move(p)}, content{std::move(c)} {}
     fs::path path;
-    std::string_view content;
+    std::string content;
 };
 
 struct compile_options
@@ -22,7 +23,7 @@ struct compile_options
     std::string kernel_name = "main";
 };
 
-kernel compile_kernel(const std::vector<src_file>& src,
+kernel compile_kernel(const std::vector<src_file>& srcs,
                       compile_options options = compile_options{});
 
 } // namespace rtc
diff --git a/codegen/test/rtc/src/compile_kernel.cpp b/codegen/test/rtc/src/compile_kernel.cpp
index 5a70f898e8..a8da88be09 100644
--- a/codegen/test/rtc/src/compile_kernel.cpp
+++ b/codegen/test/rtc/src/compile_kernel.cpp
@@ -3,14 +3,41 @@
 
 #include <rtc/hip.hpp>
 #include <rtc/compile_kernel.hpp>
+#ifdef HIPRTC_FOR_CODEGEN_TESTS
+#include <hip/hiprtc.h>
+#include <rtc/manage_ptr.hpp>
+#endif
 #include <rtc/tmp_dir.hpp>
-#include <stdexcept>
-#include <iostream>
-#include <fstream>
+#include <algorithm>
 #include <cassert>
+#include <deque>
+#include <fstream>
+#include <iostream>
+#include <numeric>
+#include <stdexcept>
 
 namespace rtc {
 
+bool EndsWith(const std::string& value, const std::string& suffix)
+{
+    if(suffix.size() > value.size())
+        return false;
+    else
+        return std::equal(suffix.rbegin(), suffix.rend(), value.rbegin());
+}
+
+std::vector<std::string> SplitString(const std::string& s, char delim)
+{
+    std::vector<std::string> elems;
+    std::stringstream ss(s + delim);
+    std::string item;
+    while(std::getline(ss, item, delim))
+    {
+        elems.push_back(item);
+    }
+    return elems;
+}
+
 template <class T>
 T generic_read_file(const std::string& filename, size_t offset = 0, size_t nbytes = 0)
 {
@@ -62,7 +89,7 @@ std::string compiler() { return "/opt/rocm/llvm/bin/clang++ -x hip --cuda-device
 // TODO: undo after extracting the codeobj
 // std::string compiler() { return "/opt/rocm/llvm/bin/clang++ -x hip"; }
 
-kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options)
+kernel clang_compile_kernel(const std::vector<src_file>& srcs, compile_options options)
 {
     assert(not srcs.empty());
     tmp_dir td{"compile"};
@@ -103,4 +130,172 @@ kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options
     return kernel{obj.data(), options.kernel_name};
 }
 
+#ifdef HIPRTC_FOR_CODEGEN_TESTS
+
+std::string hiprtc_error(hiprtcResult err, const std::string& msg)
+{
+    return "hiprtc: " + (hiprtcGetErrorString(err) + (": " + msg));
+}
+
+void hiprtc_check_error(hiprtcResult err, const std::string& msg = "")
+{
+    if(err != HIPRTC_SUCCESS)
+        throw std::runtime_error(hiprtc_error(err, msg));
+}
+
+struct hiprtc_src_file
+{
+    hiprtc_src_file() = default;
+    hiprtc_src_file(const src_file& s) : path(s.path.string()), content(s.content) {}
+    std::string path;
+    std::string content;
+};
+
+void hiprtc_program_destroy(hiprtcProgram prog) { hiprtcDestroyProgram(&prog); }
+using hiprtc_program_ptr = RTC_MANAGE_PTR(hiprtcProgram, hiprtc_program_destroy);
+
+template <class... Ts>
+hiprtc_program_ptr hiprtc_program_create(Ts... xs)
+{
+    hiprtcProgram prog = nullptr;
+    auto result        = hiprtcCreateProgram(&prog, xs...);
+    hiprtc_program_ptr p{prog};
+    hiprtc_check_error(result, "Create program failed.");
+    return p;
+}
+
+struct hiprtc_program
+{
+    struct string_array
+    {
+        std::deque<std::string> strings{};
+        std::vector<const char*> c_strs{};
+
+        string_array() {}
+        string_array(const string_array&) = delete;
+
+        std::size_t size() const { return strings.size(); }
+
+        const char** data() { return c_strs.data(); }
+
+        void push_back(std::string s)
+        {
+            strings.push_back(std::move(s));
+            c_strs.push_back(strings.back().c_str());
+        }
+    };
+
+    hiprtc_program_ptr prog = nullptr;
+    string_array headers{};
+    string_array include_names{};
+    std::string cpp_src  = "";
+    std::string cpp_name = "";
+
+    hiprtc_program(const std::string& src, const std::string& name = "main.cpp")
+        : cpp_src(src), cpp_name(name)
+    {
+        create_program();
+    }
+
+    hiprtc_program(std::vector<src_file> srcs)
+    {
+        for(auto&& src : srcs)
+        {
+            if(EndsWith(src.path, ".cpp"))
+            {
+                cpp_src  = std::move(src.content);
+                cpp_name = std::move(src.path);
+            }
+            else
+            {
+                headers.push_back(std::move(src.content));
+                include_names.push_back(std::move(src.path));
+            }
+        }
+        create_program();
+    }
+
+    void create_program()
+    {
+        assert(not cpp_src.empty());
+        assert(not cpp_name.empty());
+        assert(headers.size() == include_names.size());
+        prog = hiprtc_program_create(cpp_src.c_str(),
+                                     cpp_name.c_str(),
+                                     headers.size(),
+                                     headers.data(),
+                                     include_names.data());
+    }
+
+    void compile(const std::vector<std::string>& options, bool quiet = false) const
+    {
+        std::vector<const char*> c_options;
+        std::transform(options.begin(),
+                       options.end(),
+                       std::back_inserter(c_options),
+                       [](const std::string& s) { return s.c_str(); });
+        auto result   = hiprtcCompileProgram(prog.get(), c_options.size(), c_options.data());
+        auto prog_log = log();
+        if(not prog_log.empty() and not quiet)
+        {
+            std::cerr << prog_log << std::endl;
+        }
+        if(result != HIPRTC_SUCCESS)
+            throw std::runtime_error("Compilation failed.");
+    }
+
+    std::string log() const
+    {
+        std::size_t n = 0;
+        hiprtc_check_error(hiprtcGetProgramLogSize(prog.get(), &n));
+        if(n == 0)
+            return {};
+        std::string buffer(n, '\0');
+        hiprtc_check_error(hiprtcGetProgramLog(prog.get(), buffer.data()));
+        assert(buffer.back() != 0);
+        return buffer;
+    }
+
+    std::vector<char> get_code_obj() const
+    {
+        std::size_t n = 0;
+        hiprtc_check_error(hiprtcGetCodeSize(prog.get(), &n));
+        std::vector<char> buffer(n);
+        hiprtc_check_error(hiprtcGetCode(prog.get(), buffer.data()));
+        return buffer;
+    }
+};
+
+std::vector<std::vector<char>> compile_hip_src_with_hiprtc(const std::vector<src_file>& srcs,
+                                                           const compile_options& options)
+{
+    hiprtc_program prog(srcs);
+    auto flags = SplitString(options.flags, ' ');
+    prog.compile(flags);
+    return {prog.get_code_obj()};
+}
+
+static kernel hiprtc_compile_kernel(const std::vector<src_file>& srcs, compile_options options)
+{
+    options.flags += " -I. -O3";
+    options.flags += " -std=c++17";
+    options.flags += " --offload-arch=" + get_device_name();
+    auto cos = compile_hip_src_with_hiprtc(srcs, options);
+    if(cos.size() != 1)
+        std::runtime_error("No code object");
+    auto& obj = cos.front();
+    return kernel{obj.data(), options.kernel_name};
+}
+
+#endif
+
+kernel compile_kernel(const std::vector<src_file>& srcs, compile_options options)
+{
+#ifdef HIPRTC_FOR_CODEGEN_TESTS
+    return hiprtc_compile_kernel(srcs, options);
+#else
+    return clang_compile_kernel(srcs, options);
+#endif
+}
+
 } // namespace rtc
diff --git a/docs/sphinx/requirements.in b/docs/sphinx/requirements.in
index e9df8c9f5f..d61b5e2b27 100644
--- a/docs/sphinx/requirements.in
+++ b/docs/sphinx/requirements.in
@@ -1,2 +1,2 @@
-rocm-docs-core==1.15.0
+rocm-docs-core==1.17.0
 sphinxcontrib-bibtex==2.6.3
diff --git a/docs/sphinx/requirements.txt b/docs/sphinx/requirements.txt
index a42fdf09bf..177f3ec184 100644
--- a/docs/sphinx/requirements.txt
+++ b/docs/sphinx/requirements.txt
@@ -199,7 +199,7 @@ requests==2.32.3
     # via
     #   pygithub
     #   sphinx
-rocm-docs-core==1.15.0
+rocm-docs-core==1.17.0
     # via -r requirements.in
 rpds-py==0.22.3
     # via
diff --git a/example/01_gemm/gemm_xdl_streamk.cpp b/example/01_gemm/gemm_xdl_streamk.cpp
index dbdf7199e8..01542c4775 100755
--- a/example/01_gemm/gemm_xdl_streamk.cpp
+++ b/example/01_gemm/gemm_xdl_streamk.cpp
@@ -27,11 +27,15 @@ using DeviceGemmStreamK = ck::tensor_operation::device::DeviceGemmXdlStreamK
 // ######|      Type|      Type|      Type|        Type|        |        |        | Elementwise| Elementwise| Elementwise|   Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
 // ######|          |          |          |            |        |        |        |   Operation|   Operation|   Operation|       |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
 // ######|          |          |          |            |        |        |        |            |            |            |       |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+    < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,    256,   256,   128,     4, 16,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,      S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
+#else // defined(CK_USE_AMD_MFMA_GFX950)
     < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,    256,   128,   128,     4,  8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,      S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8>;
 
           // < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,    256,   256,   128,     4,   8,  32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,         1,           1,           1,               S<1, 32, 1, 8>,              8>;
       //  < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,    128,   32,   64,     4,   8,  32,   32,    1,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 16, 1, 8>,              8>;
        // < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout,  AElementOp,  BElementOp,  CElementOp,    128,   32,   128,     4,   8,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,         1,           1,           1,               S<1, 32, 1, 4>,              8>;
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
 
 
 
diff --git a/example/20_grouped_conv_bwd_weight/CMakeLists.txt b/example/20_grouped_conv_bwd_weight/CMakeLists.txt
index 497ea19e11..6fbaee7dba 100644
--- a/example/20_grouped_conv_bwd_weight/CMakeLists.txt
+++ b/example/20_grouped_conv_bwd_weight/CMakeLists.txt
@@ -13,3 +13,9 @@ add_example_dependencies(example_grouped_conv_bwd_weight example_grouped_conv_bw
 
 add_example_executable(example_grouped_conv_bwd_weight_dl_fp16 grouped_conv_bwd_weight_dl_fp16.cpp)
 add_example_dependencies(example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_dl_fp16)
+
+add_example_executable(example_grouped_conv_bwd_weight_v3_xdl_bf16 grouped_conv_bwd_weight_v3_xdl_bf16.cpp)
+add_example_dependencies(example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_v3_xdl_bf16)
+
+add_example_executable(example_grouped_conv_bwd_weight_v3_xdl_fp16 grouped_conv_bwd_weight_v3_xdl_fp16.cpp)
+add_example_dependencies(example_grouped_conv_bwd_weight example_grouped_conv_bwd_weight_v3_xdl_fp16)
diff --git a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_v3_xdl_bf16.cpp b/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_v3_xdl_bf16.cpp
new file mode 100644
index 0000000000..c0168bf117
--- /dev/null
+++ b/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_v3_xdl_bf16.cpp
@@ -0,0 +1,102 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp"
+
+using InDataType = BF16;
+// bf16 kernel use fp32 atomic add to accumulate Weight tensor into global memory
+using WeiDataType = F32;
+using OutDataType = BF16;
+using AccDataType = F32;
+
+using InElementOp  = PassThrough;
+using WeiElementOp = PassThrough;
+using OutElementOp = PassThrough;
+
+template <ck::index_t NDimSpatial>
+using DeviceConvBwdWeightInstance =
+    // clang-format on
+    ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<
+        NDimSpatial,
+        ck::tuple_element_t<NDimSpatial - 1,
+                            ck::Tuple<ck::tensor_layout::convolution::GNWC,
+                                      ck::tensor_layout::convolution::GNHWC,
+                                      ck::tensor_layout::convolution::GNDHWC>>,
+        ck::tuple_element_t<NDimSpatial - 1,
+                            ck::Tuple<ck::tensor_layout::convolution::GKXC,
+                                      ck::tensor_layout::convolution::GKYXC,
+                                      ck::tensor_layout::convolution::GKZYXC>>,
+        ck::tuple_element_t<NDimSpatial - 1,
+                            ck::Tuple<ck::tensor_layout::convolution::GNWK,
+                                      ck::tensor_layout::convolution::GNHWK,
+                                      ck::tensor_layout::convolution::GNDHWK>>,
+        InDataType,           // InDataType
+        WeiDataType,          // WeiDataType
+        OutDataType,          // OutDataType
+        AccDataType,          // AccDataType
+        InElementOp,          // InElementwiseOperation
+        WeiElementOp,         // WeiElementwiseOperation
+        OutElementOp,         // OutElementwiseOperation
+        ConvBwdWeightDefault, // ConvolutionBackwardWeightSpecialization
+        256,                  // BlockSize
+        128,                  // MPerBlock
+        128,                  // NPerBlock
+        32,                   // K0PerBlock
+        8,                    // K1
+        32,                   // MPerXdl
+        32,                   // NPerXdl
+        2,                    // MXdlPerWave
+        2,                    // NXdlPerWave
+        S<4, 16, 1>,          // ABlockTransferThreadClusterLengths_K0_M_K1
+        S<2, 0, 1>,           // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,           // ABlockTransferSrcAccessOrder
+        2,                    // ABlockTransferSrcVectorDim
+        1,                    // ABlockTransferSrcScalarPerVector
+        2,                    // ABlockTransferDstScalarPerVector_K1
+        true,                 // ABlockLdsAddExtraM
+        S<4, 16, 1>,          // BBlockTransferThreadClusterLengths_K0_N_K1
+        S<2, 0, 1>,           // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,           // BBlockTransferSrcAccessOrder
+        2,                    // BBlockTransferSrcVectorDim
+        1,                    // BBlockTransferSrcScalarPerVector
+        2,                    // BBlockTransferDstScalarPerVector_K1
+        true,                 // BBlockLdsAddExtraN
+        1,                    // CShuffleMXdlPerWavePerShuffle
+        1,                    // CShuffleNXdlPerWavePerShuffle
+        S<1, 32, 1, 4>,       // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+        128 / (sizeof(WeiDataType) * CHAR_BIT)>; // CBlockTransferScalarPerVector_NWaveNPerXdl
+                                                 // clang-format off
+
+template <ck::index_t NDimSpatial>
+using HostConvBwdWeightInstance = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
+                                                                                     InDataType,
+                                                                                     WeiDataType,
+                                                                                     OutDataType,
+                                                                                     InElementOp,
+                                                                                     WeiElementOp,
+                                                                                     OutElementOp>;
+
+#include "run_grouped_conv_bwd_weight_example.inc"
+
+int main(int argc, char* argv[])
+{
+    ExecutionConfig config;
+    ck::utils::conv::ConvParam conv_param = DefaultConvParam;
+
+    if(!parse_cmd_args(argc, argv, config, conv_param))
+    {
+        return 1;
+    }
+
+    switch(conv_param.num_dim_spatial_)
+    {
+    case 1: return !run_grouped_conv_bwd_weight<1>(config, conv_param);
+    case 2: return !run_grouped_conv_bwd_weight<2>(config, conv_param);
+    case 3: return !run_grouped_conv_bwd_weight<3>(config, conv_param);
+    default: break;
+    }
+
+    return 1;
+}
diff --git a/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_v3_xdl_fp16.cpp b/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_v3_xdl_fp16.cpp
new file mode 100644
index 0000000000..4b9fd356df
--- /dev/null
+++ b/example/20_grouped_conv_bwd_weight/grouped_conv_bwd_weight_v3_xdl_fp16.cpp
@@ -0,0 +1,99 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "common.hpp"
+
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp"
+
+using InDataType  = F16;
+using WeiDataType = F16;
+using OutDataType = F16;
+using AccDataType = F32;
+
+using InElementOp  = PassThrough;
+using WeiElementOp = PassThrough;
+using OutElementOp = PassThrough;
+
+template <ck::index_t NDimSpatial>
+using DeviceConvBwdWeightInstance =
+    ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Xdl_CShuffleV3<
+        NDimSpatial,
+        ck::tuple_element_t<NDimSpatial - 1,
+                            ck::Tuple<ck::tensor_layout::convolution::GNWC,
+                                      ck::tensor_layout::convolution::GNHWC,
+                                      ck::tensor_layout::convolution::GNDHWC>>,
+        ck::tuple_element_t<NDimSpatial - 1,
+                            ck::Tuple<ck::tensor_layout::convolution::GKXC,
+                                      ck::tensor_layout::convolution::GKYXC,
+                                      ck::tensor_layout::convolution::GKZYXC>>,
+        ck::tuple_element_t<NDimSpatial - 1,
+                            ck::Tuple<ck::tensor_layout::convolution::GNWK,
+                                      ck::tensor_layout::convolution::GNHWK,
+                                      ck::tensor_layout::convolution::GNDHWK>>,
+        InDataType,           // InDataType
+        WeiDataType,          // WeiDataType
+        OutDataType,          // OutDataType
+        AccDataType,          // AccDataType
+        InElementOp,          // InElementwiseOperation
+        WeiElementOp,         // WeiElementwiseOperation
+        OutElementOp,         // OutElementwiseOperation
+        ConvBwdWeightDefault, // ConvolutionBackwardWeightSpecialization
+        256,                  // BlockSize
+        128,                  // MPerBlock
+        128,                  // NPerBlock
+        32,                   // K0PerBlock
+        8,                    // K1
+        32,                   // MPerXdl
+        32,                   // NPerXdl
+        2,                    // MXdlPerWave
+        2,                    // NXdlPerWave
+        S<4, 16, 1>,          // ABlockTransferThreadClusterLengths_K0_M_K1
+        S<2, 0, 1>,           // ABlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,           // ABlockTransferSrcAccessOrder
+        2,                    // ABlockTransferSrcVectorDim
+        1,                    // ABlockTransferSrcScalarPerVector
+        2,                    // ABlockTransferDstScalarPerVector_K1
+        false,                // ABlockLdsAddExtraM
+        S<4, 16, 1>,          // BBlockTransferThreadClusterLengths_K0_N_K1
+        S<2, 0, 1>,           // BBlockTransferThreadClusterArrangeOrder
+        S<1, 0, 2>,           // BBlockTransferSrcAccessOrder
+        2,                    // BBlockTransferSrcVectorDim
+        1,                    // BBlockTransferSrcScalarPerVector
+        2,                    // BBlockTransferDstScalarPerVector_K1
+        false,                // BBlockLdsAddExtraN
+        1,                    // CShuffleMXdlPerWavePerShuffle
+        1,                    // CShuffleNXdlPerWavePerShuffle
+        S<1, 32, 1, 4>,       // CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+        128 / (sizeof(WeiDataType) * CHAR_BIT)>; // CBlockTransferScalarPerVector_NWaveNPerXdl
+
+template <ck::index_t NDimSpatial>
+using HostConvBwdWeightInstance = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
+                                                                                     InDataType,
+                                                                                     WeiDataType,
+                                                                                     OutDataType,
+                                                                                     InElementOp,
+                                                                                     WeiElementOp,
+                                                                                     OutElementOp>;
+
+#include "run_grouped_conv_bwd_weight_example.inc"
+
+int main(int argc, char* argv[])
+{
+    ExecutionConfig config;
+    ck::utils::conv::ConvParam conv_param = DefaultConvParam;
+
+    if(!parse_cmd_args(argc, argv, config, conv_param))
+    {
+        return 1;
+    }
+
+    switch(conv_param.num_dim_spatial_)
+    {
+    case 1: return !run_grouped_conv_bwd_weight<1>(config, conv_param);
+    case 2: return !run_grouped_conv_bwd_weight<2>(config, conv_param);
+    case 3: return !run_grouped_conv_bwd_weight<3>(config, conv_param);
+    default: break;
+    }
+
+    return 1;
+}
diff --git a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp
index c3981c0211..9a81ef5ea7 100644
--- a/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp
+++ b/example/65_gemm_multiply_multiply/gemm_multiply_multiply_xdl_fp8_bpreshuffle.cpp
@@ -136,30 +136,18 @@ using CDEElementOp = MultiplyMultiply;
 
 static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::Default;
 
-// using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShuffle_V3
 using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle
     // clang-format off
-///######|  ALayout|  BLayout| DsLayout| ELayout|      AData|      BData|     DsData|     EData|     AccData|         CShuffle|           A|           B|          CDE|           GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-///######|         |         |         |        |       Type|       Type|       Type|      Type|        Type|         DataType| Elementwise| Elementwise|  Elementwise| Spacialization|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
-///######|         |         |         |        |           |           |           |          |            |                 |   Operation|   Operation|    Operation|               |      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
-///######|         |         |         |        |           |           |           |          |            |                 |            |            |             |               |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |    S<C, D0, D1>|
-///###### RCR
-        // kernel 1: 256->32x128x128 
-        // <      Row,      Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   256,   32,   128,    128,  16,  16,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,     S<8, 32, 1>,    S<1, 0, 2>,     S<1, 0, 2>,             2,              16,             16,          0,          1,           1,               S<1, 32, 1, 8>,      S<8, 8, 1>,  ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1, FP8>;
-        // <      Row,      Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   256,   32,   128,    256,  16,  16,  32,   32,    1,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,     S<16, 16, 1>,    S<1, 0, 2>,     S<1, 0, 2>,             2,              16,             16,          0,          1,           1,               S<1, 32, 1, 8>,      S<8, 8, 1>,  ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, FP8>;
-        <      Row,      Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,
-               AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   256,
-               32,   128,    256,
-               16,   16,
-               32,   32,
-               1,    1,
-               S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
-               S<16, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
-               1,    1,   S<1, 16, 1, 16>, S<8, 8, 1>,
-               ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v1, FP8>;
-        // kernel 2: 128->32x128x128
-        //  <      Row,      Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,  AElementOp,  BElementOp, CDEElementOp,       GemmSpec,   128,   32,   128,    128,  16,  16,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,     S<8, 16, 1>,    S<1, 0, 2>,     S<1, 0, 2>,             2,              16,             16,          0,          1,           1,               S<1, 16, 1, 8>,      S<8, 8, 1>,  ck::BlockGemmPipelineScheduler::Interwave, ck::BlockGemmPipelineVersion::v1, FP8>;
-
+    <   Row, Col, DsLayout, ELayout, A0DataType, B0DataType, DsDataType, EDataType, AccDataType, CShuffleDataType,
+        AElementOp,  BElementOp, CDEElementOp, GemmSpec, 256,
+        256,   256,    128,
+        16,   16,
+        16,   16,
+        8,    8,
+        S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
+        S<8, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 16, 16, 0,
+        1,    2,   S<1, 32, 1, 8>, S<8, 8, 1>,
+        ck::BlockGemmPipelineScheduler::Intrawave, ck::BlockGemmPipelineVersion::v3, FP8>;
 // clang-format on
 
 int main(int argc, char* argv[])
@@ -180,6 +168,9 @@ int main(int argc, char* argv[])
 
     ck::index_t KBatch = 1;
 
+    ck::index_t Warmup = 50;
+    ck::index_t Repeat = 50;
+
     if(argc == 1)
     {
         // use default case
@@ -207,6 +198,26 @@ int main(int argc, char* argv[])
 
         KBatch = std::stoi(argv[11]);
     }
+    else if(argc == 14)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+
+        M = std::stoi(argv[4]);
+        N = std::stoi(argv[5]);
+        K = std::stoi(argv[6]);
+
+        StrideA = std::stoi(argv[7]);
+        StrideB = std::stoi(argv[8]);
+        StrideD = std::stoi(argv[9]);
+        StrideE = std::stoi(argv[10]);
+
+        KBatch = std::stoi(argv[11]);
+
+        Warmup = std::stoi(argv[12]);
+        Repeat = std::stoi(argv[13]);
+    }
     else
     {
         printf("arg1: verification (0=no, 1=yes)\n");
@@ -214,6 +225,7 @@ int main(int argc, char* argv[])
         printf("arg3: time kernel (0=no, 1=yes)\n");
         printf(
             "arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideD, StrideE, KBatch\n");
+        printf("arg10 to 11: Warmup, Repeat\n");
         exit(0);
     }
 
@@ -321,7 +333,14 @@ int main(int argc, char* argv[])
             "not support this GEMM problem");
     }
 
-    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel, 0, 50, 50, true, 50});
+    size_t total_size =
+        (M * K * sizeof(A0DataType) + N * K * sizeof(B0DataType) + M * sizeof(D0DataType) +
+         N * sizeof(D1DataType) + M * N * sizeof(EDataType));
+    int rotate_buf_num =
+        ck::math::min(size_t(Repeat), ck::math::integer_divide_ceil(512 * 1024 * 1024, total_size));
+
+    float ave_time = invoker.Run(
+        argument, StreamConfig{nullptr, time_kernel, 0, Warmup, Repeat, true, rotate_buf_num});
 
     std::size_t flop = std::size_t(2) * M * N * K;
     std::size_t num_btype =
diff --git a/example/67_gemm_microscaling/gemm_mx_common.hpp b/example/67_gemm_microscaling/gemm_mx_common.hpp
index 5b00b5a123..30f03cb53b 100644
--- a/example/67_gemm_microscaling/gemm_mx_common.hpp
+++ b/example/67_gemm_microscaling/gemm_mx_common.hpp
@@ -13,7 +13,9 @@
 #include "ck/utility/blkgemmpipe_scheduler.hpp"
 #include "ck/utility/data_type.hpp"
 #include "ck/utility/sequence.hpp"
+
 #include "ck/library/reference_tensor_operation/cpu/reference_mx_gemm.hpp"
+
 #include "ck/library/utility/check_err.hpp"
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/fill.hpp"
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py
index c05660c8ab..4c23250d05 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py
@@ -492,6 +492,7 @@ def get_bwd_dq_dk_dv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
             if kernel_filter != None:
                 if not fnmatch.fnmatch(k.name, kernel_filter):
                     continue
+            # Flash attention integration
             if receipt == 2:
                     cond = dtype in ['fp16', 'bf16']
                     cond &= bias in ['no', 'alibi']
@@ -499,14 +500,15 @@ def get_bwd_dq_dk_dv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
                     cond &= dpad == dvpad
                     if not cond:
                         continue
-            if receipt == 3:
+            elif receipt == 3:
                     cond = dtype in ['fp16', 'bf16']
                     cond &= bias in ['no', 'alibi']
                     cond &= dpad == dvpad
                     cond &= deterministic == "f"
                     if not cond:
                         continue
-            if receipt == 4:
+            # PyTorch integration
+            elif receipt == 4:
                     cond = dtype in ['fp16', 'bf16']
                     cond &= bias in ['no', 'bias']
                     cond &= dropout in ['no', 'dropout_wg32',  'dropout_wg16']
@@ -514,6 +516,26 @@ def get_bwd_dq_dk_dv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
                     cond &= deterministic == "f"
                     if not cond:
                         continue
+            # Aiter (mha_bwd) integration
+            elif receipt == 10:
+                    cond = dtype in ['fp16', 'bf16']
+                    cond &= mode == "batch"
+                    cond &= bias in ['no', 'alibi']
+                    cond &= dropout in ['no', 'dropout_wg32',  'dropout_wg16']
+                    cond &= dpad == dvpad
+                    cond &= deterministic == "t"
+                    if not cond:
+                        continue
+            # Aiter (mha_varlen_bwd) integration
+            elif receipt == 11:
+                    cond = dtype in ['fp16', 'bf16']
+                    cond &= mode == "group"
+                    cond &= bias in ['no', 'alibi']
+                    cond &= dropout in ['no', 'dropout_wg32',  'dropout_wg16']
+                    cond &= dpad == dvpad
+                    cond &= deterministic == "t"
+                    if not cond:
+                        continue
             api_pool.register_dq_dk_dv_traits(k.api_trait())
             gen.append(k)
 
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
index ad8daba17e..b72627ed5d 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py
@@ -487,6 +487,7 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[Fm
                 if kernel_filter != None:
                     if not fnmatch.fnmatch(k.name, kernel_filter):
                         continue
+                # 2 - Flash attention integration
                 if receipt in (2, 3):
                     cond = dtype in ['fp16', 'bf16']
                     cond &= pipeline.F_vlayout == 'row'
@@ -494,13 +495,32 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[Fm
                     cond &= pipeline.F_squant == 'f'
                     if not cond:
                         continue
-                if receipt == 4:
+                # PyTorch integration
+                elif receipt == 4:
                     cond = dtype in ['fp16', 'bf16']
                     cond &= pipeline.F_vlayout == 'row'
                     cond &= pipeline.F_bias in ['no', 'bias']
                     cond &= pipeline.F_squant == 'f'
                     if not cond:
                         continue
+                # Aiter(mha_fwd) integration
+                elif receipt == 10:
+                    cond = dtype in ['fp16', 'bf16']
+                    cond &= mode == "batch"
+                    cond &= pipeline.F_vlayout == 'row'
+                    cond &= pipeline.F_bias in ['no', 'alibi']
+                    cond &= pipeline.F_squant == 'f'
+                    if not cond:
+                        continue
+                # Aiter(mha_varlen_fwd) integration
+                elif receipt == 11:
+                    cond = dtype in ['fp16', 'bf16']
+                    cond &= mode == "group"
+                    cond &= pipeline.F_vlayout == 'row'
+                    cond &= pipeline.F_bias in ['no', 'alibi']
+                    cond &= pipeline.F_squant == 'f'
+                    if not cond:
+                        continue
                 api_pool.register_traits(k.api_trait())
                 gen.append(k)
 
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py
index 2f20819302..f8a89448ba 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py
@@ -326,7 +326,9 @@ def get_fwd_appendkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
                 if kernel_filter != None:
                     if not fnmatch.fnmatch(k.name, kernel_filter):
                         continue
-                if receipt == 2:
+                # 2 - Flash attention integration
+                # 12 - Aiter(mha_fwd_kvcache) integration
+                if receipt in (2, 12):
                     cond = dtype in ['fp16', 'bf16']
                     cond &= pipeline.F_vlayout == 'row'
                     if not cond:
diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
index 37745dd382..c0ca666b11 100644
--- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
+++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py
@@ -268,7 +268,7 @@ float fmha_fwd_splitkv(fmha_fwd_splitkv_traits t, fmha_fwd_splitkv_args a, const
 FMHA_FWD_SPLITKV_API_INNER_DISPATCH="""            {F_if}((t.is_group_mode == {F_mode}) && (t.is_v_rowmajor == {F_vlayout}) && ({F_mask_check}) && (t.bias_type == {F_bias_check}) && (t.do_fp8_static_quant == {F_squant}) &&
                         ((a.block_table_ptr != nullptr) == {F_pagedkv}) && ({F_scheck}) && ({F_skcheck}) && ({F_dcheck}) && ({F_dvcheck})) {{
                 using traits_ = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, {F_pipeline_enum}, {F_mask}, {F_bias}, true, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>;
-                
+
                 // get combine kernel tile sizes
                 using OaccDataType = typename FmhaFwdTypeConfig<{F_dtype}>::OaccDataType;
                 constexpr ck_tile::index_t kM0 = ck_tile::BlockFmhaSplitKVCombinePipelineTileSizes<OaccDataType, /*F_bn1=*/32>::kM0;
@@ -705,6 +705,7 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
                 if kernel_filter != None:
                     if not fnmatch.fnmatch(k.name, kernel_filter):
                         continue
+                # Flash attention integration
                 if receipt == 2:
                     cond = dtype in ['fp16', 'bf16']
                     cond &= pipeline.F_vlayout == 'row'
@@ -712,6 +713,24 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) ->
                     cond &= pipeline.F_squant == 'f'
                     if not cond:
                         continue
+                # Aiter(mha_varlen_fwd) integration
+                elif receipt == 11:
+                    cond = dtype in ['fp16', 'bf16']
+                    cond &= mode == "group"
+                    cond &= pipeline.F_vlayout == 'row'
+                    cond &= pipeline.F_bias in ['no', 'alibi']
+                    cond &= pipeline.F_squant == 'f'
+                    if not cond:
+                        continue
+                # Aiter(mha_fwd_kvcache) integration
+                elif receipt == 12:
+                    cond = dtype in ['fp16', 'bf16']
+                    cond &= mode == "batch"
+                    cond &= pipeline.F_vlayout == 'row'
+                    cond &= pipeline.F_bias in ['no', 'alibi']
+                    cond &= pipeline.F_squant == 'f'
+                    if not cond:
+                        continue
                 api_pool.register_traits(k.api_trait())
                 gen.append(k)
 
diff --git a/example/ck_tile/01_fmha/generate.py b/example/ck_tile/01_fmha/generate.py
index a0fb42aa11..0c2cef1ce7 100644
--- a/example/ck_tile/01_fmha/generate.py
+++ b/example/ck_tile/01_fmha/generate.py
@@ -17,7 +17,7 @@ class HandlerId(IntEnum):
     LIST_BLOBS = 0
     WRITE_BLOBS = 1
 
-# inspect all modules under 'codegen.ops' and register API handlers 
+# inspect all modules under 'codegen.ops' and register API handlers
 ops = []
 for importer, module_name, _ in pkgutil.iter_modules(codegen.ops.__path__):
     full_module_name = '%s.%s' % (codegen.ops.__name__, module_name)
@@ -104,7 +104,11 @@ if __name__ == "__main__":
         help="codegen receipt. 0: generate only 8xhdim coverage\n"  + \
              "  1: generate more instance to cover all hdim\n"  + \
              "  2: Only generate instance for Flash attention integration\n"  + \
-             "  4: Only generate instance for PyTorch integration"
+             "  4: Only generate instance for PyTorch integration\n" + \
+             " 10: Only generate instance for Aiter(mha_fwd, mha_bwd) integration\n" + \
+             " 11: Only generate instance for Aiter(mha_varlen_fwd, mha_varlen_bwd) integration\n" + \
+             " 12: Only generate instance for Aiter(mha_fwd_kvcache) integration"
+      
     )
 
     args = parser.parse_args()
diff --git a/example/ck_tile/03_gemm/gemm_basic.hpp b/example/ck_tile/03_gemm/gemm_basic.hpp
index 636b34981f..dbc582e5a3 100644
--- a/example/ck_tile/03_gemm/gemm_basic.hpp
+++ b/example/ck_tile/03_gemm/gemm_basic.hpp
@@ -35,7 +35,7 @@
 #error "unsupported CK_TILE_PIPELINE_DEFAULT value"
 #endif
 
-template <typename DataType>
+template <typename ADataType, typename BDataType = ADataType, typename CDataType = ADataType>
 struct GemmBasicTypeConfig;
 
 template <>
@@ -75,6 +75,15 @@ struct GemmBasicTypeConfig<ck_tile::bf8_t>
     using CDataType   = ck_tile::half_t;
 };
 
+template <>
+struct GemmBasicTypeConfig<ck_tile::half_t, ck_tile::pk_int4_t, ck_tile::half_t>
+{
+    using ADataType   = ck_tile::half_t;
+    using BDataType   = ck_tile::pk_int4_t;
+    using AccDataType = float;
+    using CDataType   = ck_tile::half_t;
+};
+
 template <typename T>
 struct DataTypeTraits;
 
@@ -114,6 +123,12 @@ struct DataTypeTraits<ck_tile::bf8_t>
     static constexpr const char* name = "bf8";
 };
 
+template <>
+struct DataTypeTraits<ck_tile::pk_int4_t>
+{
+    static constexpr const char* name = "pk_int4_t";
+};
+
 auto create_args(int argc, char* argv[])
 {
     ck_tile::ArgParser arg_parser;
diff --git a/example/ck_tile/03_gemm/run_gemm_example.inc b/example/ck_tile/03_gemm/run_gemm_example.inc
index 042ad372dc..f068cbc1da 100644
--- a/example/ck_tile/03_gemm/run_gemm_example.inc
+++ b/example/ck_tile/03_gemm/run_gemm_example.inc
@@ -29,6 +29,60 @@ auto calculate_rtol_atol(const ck_tile::index_t K,
     // Use higher threshold
     return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
 }
+template <typename Tensor>
+void permute_tensor_b(Tensor& tensor)
+{
+    const ck_tile::index_t K = tensor.get_length(0);
+    const ck_tile::index_t N = tensor.get_length(1);
+    // vector pk_i4x4 permute
+    for(int i = 0; i < N; i++)
+    {
+        for(int j = 0; j < K; j += 8)
+        {
+            int8_t input[8];
+
+            for(int k = 0; k < 4; k++)
+            {
+                int8_t i4x2      = tensor(j + k * 2, i).data;
+                input[k * 2 + 0] = (i4x2 >> 4) & 0xf;
+                input[k * 2 + 1] = (i4x2 >> 0) & 0xf;
+            }
+
+            // permute 01234567->20643175
+            {
+                int8_t hi   = input[2];
+                int8_t lo   = input[0];
+                int8_t i4x2 = (hi << 4) | lo;
+
+                tensor(j + 0, i) = i4x2;
+            }
+
+            {
+                int8_t hi   = input[6];
+                int8_t lo   = input[4];
+                int8_t i4x2 = (hi << 4) | lo;
+
+                tensor(j + 2, i) = i4x2;
+            }
+
+            {
+                int8_t hi   = input[3];
+                int8_t lo   = input[1];
+                int8_t i4x2 = (hi << 4) | lo;
+
+                tensor(j + 4, i) = i4x2;
+            }
+
+            {
+                int8_t hi   = input[7];
+                int8_t lo   = input[5];
+                int8_t i4x2 = (hi << 4) | lo;
+
+                tensor(j + 6, i) = i4x2;
+            }
+        }
+    }
+}
 
 template <typename ADataType,
           typename BDataType,
@@ -83,7 +137,12 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
     return ave_time;
 }
 
-template <typename PrecType, typename ALayout, typename BLayout, typename CLayout>
+template <typename ADataType,
+          typename BDataType = ADataType,
+          typename CDataType = ADataType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout>
 int run_gemm_example_with_layouts(int argc,
                                   char* argv[],
                                   const ALayout a_layout                  = ALayout{},
@@ -94,10 +153,7 @@ int run_gemm_example_with_layouts(int argc,
     if(!result)
         return -1;
 
-    using ADataType   = typename GemmBasicTypeConfig<PrecType>::ADataType;
-    using BDataType   = typename GemmBasicTypeConfig<PrecType>::BDataType;
-    using CDataType   = typename GemmBasicTypeConfig<PrecType>::CDataType;
-    using AccDataType = typename GemmBasicTypeConfig<PrecType>::AccDataType;
+    using AccDataType = typename GemmBasicTypeConfig<ADataType, BDataType, CDataType>::AccDataType;
 
     ck_tile::index_t M = arg_parser.get_int("m");
     ck_tile::index_t N = arg_parser.get_int("n");
@@ -107,10 +163,10 @@ int run_gemm_example_with_layouts(int argc,
     ck_tile::index_t stride_B = arg_parser.get_int("stride_b");
     ck_tile::index_t stride_C = arg_parser.get_int("stride_c");
 
-    ck_tile::index_t kbatch = arg_parser.get_int("split_k");
-    int n_warmup            = arg_parser.get_int("warmup");
-    int n_repeat            = arg_parser.get_int("repeat");
-    ck_tile::index_t init_method = arg_parser.get_int("init");  
+    ck_tile::index_t kbatch      = arg_parser.get_int("split_k");
+    int n_warmup                 = arg_parser.get_int("warmup");
+    int n_repeat                 = arg_parser.get_int("repeat");
+    ck_tile::index_t init_method = arg_parser.get_int("init");
 
     stride_A = ck_tile::get_default_stride(M, K, stride_A, is_row_major(a_layout));
     stride_B = ck_tile::get_default_stride(K, N, stride_B, is_row_major(b_layout));
@@ -123,16 +179,23 @@ int run_gemm_example_with_layouts(int argc,
     ck_tile::HostTensor<CDataType> c_m_n_dev_result(
         ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
 
-    if (init_method == 0) {
+    if(init_method == 0)
+    {
         ck_tile::FillUniformDistribution<ADataType>{-1.f, 1.f}(a_m_k);
         ck_tile::FillUniformDistribution<BDataType>{-1.f, 1.f}(b_k_n);
-    } else if (init_method == 1) {
+    }
+    else if(init_method == 1)
+    {
         ck_tile::FillMonotonicSeq<ADataType>{}(a_m_k);
         ck_tile::FillMonotonicSeq<BDataType>{}(b_k_n);
-    } else if (init_method == 2) {
+    }
+    else if(init_method == 2)
+    {
         ck_tile::FillConstant<ADataType>{static_cast<ADataType>(1)}(a_m_k);
         ck_tile::FillConstant<BDataType>{static_cast<BDataType>(1)}(b_k_n);
-    } else {
+    }
+    else
+    {
         a_m_k.SetZero();
         b_k_n.SetZero();
     }
@@ -142,7 +205,17 @@ int run_gemm_example_with_layouts(int argc,
     ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes());
 
     a_m_k_dev_buf.ToDevice(a_m_k.data());
-    b_k_n_dev_buf.ToDevice(b_k_n.data());
+    if constexpr(std::is_same_v<BDataType, ck_tile::pk_int4_t>)
+    {
+        // Permute data for device implementation
+        ck_tile::HostTensor<BDataType> b_k_n_dev = b_k_n;
+        permute_tensor_b(b_k_n_dev);
+        b_k_n_dev_buf.ToDevice(b_k_n_dev.data());
+    }
+    else
+    {
+        b_k_n_dev_buf.ToDevice(b_k_n.data());
+    }
     c_m_n_dev_buf.SetZero();
     c_m_n_dev_result.SetZero();
 
@@ -188,6 +261,11 @@ int run_gemm_example_with_layouts(int argc,
     }
     else if(arg_parser.get_int("v") == 2)
     {
+        if constexpr(std::is_same_v<BDataType, ck_tile::pk_int4_t>)
+        {
+            // Restore input for B for gpu reference
+            b_k_n_dev_buf.ToDevice(b_k_n.data());
+        }
         ck_tile::HostTensor<CDataType> c_m_n_gpu_ref(
             ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{})));
         ck_tile::DeviceMem c_m_n_gpu_buf_ref(c_m_n_gpu_ref.get_element_space_size_in_bytes());
@@ -198,17 +276,18 @@ int run_gemm_example_with_layouts(int argc,
         BDataType* d_B;
         CDataType* d_C;
 
-        ck_tile::hip_check_error(hipMalloc(&d_A, M * K * sizeof(ADataType)));
-        ck_tile::hip_check_error(hipMalloc(&d_B, N * K * sizeof(BDataType)));
-        ck_tile::hip_check_error(hipMalloc(&d_C, M * N * sizeof(CDataType)));
+        ck_tile::hip_check_error(hipMalloc(&d_A, a_m_k.get_element_space_size_in_bytes()));
+        ck_tile::hip_check_error(hipMalloc(&d_B, b_k_n.get_element_space_size_in_bytes()));
+        ck_tile::hip_check_error(
+            hipMalloc(&d_C, c_m_n_dev_result.get_element_space_size_in_bytes()));
 
         ck_tile::hip_check_error(hipMemcpy(d_A,
                                            a_m_k_dev_buf.GetDeviceBuffer(),
-                                           M * K * sizeof(ADataType),
+                                           a_m_k.get_element_space_size_in_bytes(),
                                            hipMemcpyHostToDevice));
         ck_tile::hip_check_error(hipMemcpy(d_B,
                                            b_k_n_dev_buf.GetDeviceBuffer(),
-                                           N * K * sizeof(BDataType),
+                                           b_k_n.get_element_space_size_in_bytes(),
                                            hipMemcpyHostToDevice));
 
         ck_tile::reference_gemm_gpu<ADataType,
@@ -221,7 +300,7 @@ int run_gemm_example_with_layouts(int argc,
 
         ck_tile::hip_check_error(hipMemcpy(c_m_n_gpu_buf_ref.GetDeviceBuffer(),
                                            d_C,
-                                           M * N * sizeof(CDataType),
+                                           c_m_n_dev_result.get_element_space_size_in_bytes(),
                                            hipMemcpyDeviceToHost));
 
         ck_tile::hip_check_error(hipFree(d_A));
diff --git a/example/ck_tile/03_gemm/universal_gemm.cpp b/example/ck_tile/03_gemm/universal_gemm.cpp
index d1b79177f5..ab763437e5 100644
--- a/example/ck_tile/03_gemm/universal_gemm.cpp
+++ b/example/ck_tile/03_gemm/universal_gemm.cpp
@@ -321,6 +321,15 @@ int run_gemm_example(int argc, char* argv[])
         {
             return run_gemm_example_with_layouts<ck_tile::bf8_t>(argc, argv, Row{}, Col{}, Row{});
         }
+#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
+        else if(data_type == "pk_int4_t")
+        {
+            // TODO: Add support for bhalf_t ADataType
+            return run_gemm_example_with_layouts<ck_tile::half_t,
+                                                 ck_tile::pk_int4_t,
+                                                 ck_tile::half_t>(argc, argv, Row{}, Col{}, Row{});
+        }
+#endif
         else
         {
             throw std::runtime_error("Unsupported data_type!");
@@ -344,6 +353,15 @@ int run_gemm_example(int argc, char* argv[])
         {
             return run_gemm_example_with_layouts<ck_tile::bf8_t>(argc, argv, Col{}, Col{}, Row{});
         }
+#if(CK_TILE_PIPELINE_DEFAULT == CK_TILE_PIPELINE_COMPUTE_V3)
+        else if(data_type == "pk_int4_t")
+        {
+            // TODO: Add support for bhalf_t ADataType
+            return run_gemm_example_with_layouts<ck_tile::half_t,
+                                                 ck_tile::pk_int4_t,
+                                                 ck_tile::half_t>(argc, argv, Col{}, Col{}, Row{});
+        }
+#endif
         else
         {
             throw std::runtime_error("Unsupported data_type!");
diff --git a/example/ck_tile/13_moe_sorting/moe_sorting.cpp b/example/ck_tile/13_moe_sorting/moe_sorting.cpp
index e470f4142c..69d211236f 100644
--- a/example/ck_tile/13_moe_sorting/moe_sorting.cpp
+++ b/example/ck_tile/13_moe_sorting/moe_sorting.cpp
@@ -152,6 +152,13 @@ bool test_moe_sorting(ck_tile::ArgParser args)
     if(local_expert_masking)
         local_expert_masking_dev.ToDevice(local_expert_masking_host.data());
 
+    // if return zero, means no need workspace, can set moe_sorting_args.p_ws to nullptr
+    ck_tile::index_t workspace_size = moe_sorting_get_workspace_size(tokens, num_experts);
+    ck_tile::DeviceMem moe_sorting_ws(workspace_size != 0 ? workspace_size : 0);
+
+    if(workspace_size != 0)
+        moe_sorting_ws.SetZero(); // note, clear here!!!!
+
     moe_sorting_trait trait{index_prec, weight_prec, local_expert_masking};
 
     moe_sorting_args karg{topk_ids_dev.GetDeviceBuffer(),
@@ -163,6 +170,7 @@ bool test_moe_sorting(ck_tile::ArgParser args)
                           sorted_expert_ids_dev.GetDeviceBuffer(),
                           sorted_id_cnt_dev.GetDeviceBuffer(),
                           moe_buf_size > 0 ? moe_buf_dev.GetDeviceBuffer() : nullptr,
+                          workspace_size != 0 ? moe_sorting_ws.GetDeviceBuffer() : nullptr,
                           tokens,
                           unit_size,
                           num_experts,
@@ -174,13 +182,68 @@ bool test_moe_sorting(ck_tile::ArgParser args)
                               /* log_level = */ (kname ? 1 : 0),
                               warmup,
                               repeat};
+
     auto ms = moe_sorting(trait, karg, sc);
-    printf("[%s|%s]tokens:%d, num_experts:%d, topk:%d, ",
+    // auto ms = moe_sorting_mp(trait, karg, sc);
+
+#if 0
+    {
+        ck_tile::HostTensor<char> ws_host({workspace_size}, {1});
+        moe_sorting_ws.FromDevice(ws_host.data());
+
+        int * p_mesh = reinterpret_cast<int*>(ws_host.data());
+        ck_tile::index_t row_size = ck_tile::impl::moe_sorting_mp_mesh_stride(tokens);
+
+        std::cout << "topk_ids:" << std::endl;
+
+        int * p_topk_ids = reinterpret_cast<int*>(topk_ids_host.data());
+        for(int i_token = 0; i_token < tokens; i_token++) {
+            printf("[t:%2d]", i_token);
+            for(int i_topk = 0; i_topk < topk; i_topk++) {
+                printf("%d, ",p_topk_ids[i_token * topk + i_topk] );
+            }
+            printf("\n");
+        }
+        printf("----------------\n");
+
+        std::vector<int> l_cumsum (num_experts + 1, 0);
+        for(int i_expert = 0; i_expert < num_experts; i_expert++  ) {
+            printf("[e:%2d]", i_expert);
+            int e_cnt = 0;
+            for(int i_token = 0; i_token < tokens; i_token++) {
+                auto v_mesh = p_mesh[i_expert * row_size + i_token];
+                e_cnt += v_mesh != 0 ? 1 : 0;
+                printf("%d, ", v_mesh); 
+            }
+            int e_cnt_unit = (e_cnt + unit_size - 1) / unit_size;
+            printf("[%d/%d]", e_cnt, e_cnt_unit);
+            printf("\n");
+            l_cumsum[i_expert + 1] = l_cumsum[i_expert] + e_cnt_unit;
+        }
+
+        printf("----------------\n");
+        printf("cumsum:\n");
+        for(int i_cc= 0; i_cc < num_experts + 1; i_cc++) {
+            printf("%2d, ", l_cumsum[i_cc]);
+        }
+        printf("\n");
+        printf("----------------\n");
+
+        int * p_cumsum = p_mesh + ck_tile::impl::moe_sorting_mp_mesh_elem(tokens, num_experts);
+        for(int i_expert = 0; i_expert < num_experts + 1; i_expert++  ) {
+            printf("%2d(%d), ",p_cumsum[i_expert], p_cumsum[i_expert] / unit_size);
+        }
+        printf("\n");
+    }
+#endif
+
+    printf("[%s|%s]tokens:%d, num_experts:%d, topk:%d, mp:%d, ",
            index_prec.c_str(),
            weight_prec.c_str(),
            tokens,
            num_experts,
-           topk);
+           topk,
+           workspace_size != 0 ? 1 : 0);
 
     if(local_expert_masking)
     {
@@ -225,28 +288,41 @@ bool test_moe_sorting(ck_tile::ArgParser args)
                                                               num_experts,
                                                               unit_size,
                                                               local_expert_masking);
-        rtn &= ck_tile::check_err(
-            sorted_ids_host, sorted_ids_ref, std::string("OUT Error: Incorrect ids!"), 1e-6, 1e-6);
-        rtn &= ck_tile::check_err(sorted_weights_host,
-                                  sorted_weights_ref,
-                                  std::string("OUT Error: Incorrect w!"),
-                                  1e-6,
-                                  1e-6);
-        rtn &= ck_tile::check_err(sorted_expert_ids_host,
-                                  sorted_expert_ids_ref,
-                                  std::string("OUT Error: Incorrect eid!"),
-                                  1e-6,
-                                  1e-6);
+        printf("total_tokens_post_pad:%d(%d), ",
+               ref_total_tokens_post_pad,
+               sorted_id_cnt_host.mData[0]);
+        if(ref_total_tokens_post_pad == sorted_id_cnt_host.mData[0])
+        {
+            size_t slen = ref_total_tokens_post_pad;
+            rtn &= ck_tile::check_err(sorted_ids_host.slice({0}, {slen}),
+                                      sorted_ids_ref.slice({0}, {slen}),
+                                      std::string("OUT Error: Incorrect ids!"),
+                                      1e-6,
+                                      1e-6);
+            rtn &= ck_tile::check_err(sorted_weights_host.slice({0}, {slen}),
+                                      sorted_weights_ref.slice({0}, {slen}),
+                                      std::string("OUT Error: Incorrect w!"),
+                                      1e-6,
+                                      1e-6);
+            rtn &= ck_tile::check_err(sorted_expert_ids_host.slice({0}, {slen / unit_size}),
+                                      sorted_expert_ids_ref.slice({0}, {slen / unit_size}),
+                                      std::string("OUT Error: Incorrect eid!"),
+                                      1e-6,
+                                      1e-6);
+        }
+        else
+        {
+            printf("(token size not equal!!)");
+            rtn = false;
+        }
+
         if(moe_buf_size)
         {
             ck_tile::HostTensor<WeightType> moe_buf_ref({moe_buf_size});
             rtn &= ck_tile::check_err(
                 moe_buf_host, moe_buf_ref, std::string("OUT Error: Incorrect zero buf!"), 0, 0);
         }
-        rtn &= ref_total_tokens_post_pad == sorted_id_cnt_host.mData[0];
-        printf("total_tokens_post_pad:%d(%d), ",
-               ref_total_tokens_post_pad,
-               sorted_id_cnt_host.mData[0]);
+        // rtn &= ref_total_tokens_post_pad == sorted_id_cnt_host.mData[0];
     }
 
     printf("valid:%s", rtn ? "y" : "n");
diff --git a/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp b/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp
index abff24a669..109ec1b157 100644
--- a/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp
+++ b/example/ck_tile/13_moe_sorting/moe_sorting_api.cpp
@@ -153,18 +153,106 @@ float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_confi
         }
         }
 #else
-        using index_t            = ck_tile::index_t;
-        using ms_weight_type     = float;
-        auto [r_, c_]            = ck_tile::moe_sorting_get_smem_row_col(a.tokens, a.num_experts);
-        auto sub_token_          = r_ - 2;
-        r_                       = (r_ - 2) / 8;
-        bool is_sub_token_onshot = a.tokens <= sub_token_;
+        if(moe_sorting_get_workspace_size(a.tokens, a.num_experts) != 0)
+        {
+            return moe_sorting_mp(t, a, s);
+        }
+        using index_t                = ck_tile::index_t;
+        using ms_weight_type         = float;
+        auto sub_token_              = ck_tile::moe_sorting_get_sub_token(a.tokens, a.num_experts);
+        auto row_                    = sub_token_ / 8;
+        bool is_sub_token_onshot     = a.tokens <= sub_token_;
         bool is_local_expert_masking = t.local_expert_masking;
-        (void)c_;
 
-        MOE_SORTING_DISPATCH_EMASK_(r_);
+        MOE_SORTING_DISPATCH_EMASK_(row_);
         // MOE_SORTING_DISPATCH_ETILE(0, 0);
 #endif
     }
     return -1;
 }
+
+#define MOE_SORTING_MP_0(unroll_num_, expert_masking_)                                            \
+    [&]() {                                                                                       \
+        constexpr ck_tile::index_t unroll_num = unroll_num_;                                      \
+        constexpr bool expert_masking         = expert_masking_;                                  \
+        using ms_problem =                                                                        \
+            ck_tile::MoeSortingProblemMp<ms_index_t, ms_weight_type, unroll_num, expert_masking>; \
+        using kernel      = ck_tile::MoeSortingMultiPhaseKernel_P0<ms_problem>;                   \
+        auto kargs        = kernel::MakeKargs(a);                                                 \
+        const dim3 grids  = kernel::GridSize(a);                                                  \
+        const dim3 blocks = kernel::BlockSize(a);                                                 \
+        return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs);                           \
+    }()
+
+#define MOE_SORTING_MP_1(unroll_num_, expert_masking_)                                            \
+    [&]() {                                                                                       \
+        constexpr ck_tile::index_t unroll_num = unroll_num_;                                      \
+        constexpr bool expert_masking         = expert_masking_;                                  \
+        using ms_problem =                                                                        \
+            ck_tile::MoeSortingProblemMp<ms_index_t, ms_weight_type, unroll_num, expert_masking>; \
+        using kernel      = ck_tile::MoeSortingMultiPhaseKernel_P1<ms_problem>;                   \
+        auto kargs        = kernel::MakeKargs(a);                                                 \
+        const dim3 grids  = kernel::GridSize(a);                                                  \
+        const dim3 blocks = kernel::BlockSize(a);                                                 \
+        return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs);                           \
+    }()
+
+#define MOE_SORTING_MP_2(unroll_num_, expert_masking_)                                            \
+    [&]() {                                                                                       \
+        constexpr ck_tile::index_t unroll_num = unroll_num_;                                      \
+        constexpr bool expert_masking         = expert_masking_;                                  \
+        using ms_problem =                                                                        \
+            ck_tile::MoeSortingProblemMp<ms_index_t, ms_weight_type, unroll_num, expert_masking>; \
+        using kernel      = ck_tile::MoeSortingMultiPhaseKernel_P2<ms_problem>;                   \
+        auto kargs        = kernel::MakeKargs(a);                                                 \
+        const dim3 grids  = kernel::GridSize(a);                                                  \
+        const dim3 blocks = kernel::BlockSize(a);                                                 \
+        return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs);                           \
+    }()
+
+#define MOE_SORTING_MP_3(unroll_num_, expert_masking_)                                            \
+    [&]() {                                                                                       \
+        constexpr ck_tile::index_t unroll_num = unroll_num_;                                      \
+        constexpr bool expert_masking         = expert_masking_;                                  \
+        using ms_problem =                                                                        \
+            ck_tile::MoeSortingProblemMp<ms_index_t, ms_weight_type, unroll_num, expert_masking>; \
+        using kernel      = ck_tile::MoeSortingMultiPhaseKernel_P3<ms_problem>;                   \
+        auto kargs        = kernel::MakeKargs(a);                                                 \
+        const dim3 grids  = kernel::GridSize(a);                                                  \
+        const dim3 blocks = kernel::BlockSize(a);                                                 \
+        return ck_tile::make_kernel(kernel{}, grids, blocks, 0, kargs);                           \
+    }()
+
+float moe_sorting_mp(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_config s)
+{
+    if(t.weight_type == "fp32" && t.index_type == "int32")
+    {
+        using ms_index_t     = ck_tile::index_t;
+        using ms_weight_type = float;
+
+        if(t.local_expert_masking)
+        {
+            float ave_time = ck_tile::launch_kernel(s,
+                                                    MOE_SORTING_MP_0(1, true),
+                                                    MOE_SORTING_MP_1(1, true),
+                                                    MOE_SORTING_MP_2(1, true),
+                                                    MOE_SORTING_MP_3(1, true));
+            return ave_time;
+        }
+        else
+        {
+            float ave_time = ck_tile::launch_kernel(s,
+                                                    MOE_SORTING_MP_0(1, false),
+                                                    MOE_SORTING_MP_1(1, false),
+                                                    MOE_SORTING_MP_2(1, false),
+                                                    MOE_SORTING_MP_3(1, false));
+            return ave_time;
+        }
+    }
+    return -1;
+}
+
+int moe_sorting_get_workspace_size(int tokens, int num_experts)
+{
+    return ck_tile::moe_sorting_get_workspace_size(tokens, num_experts);
+}
diff --git a/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp b/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp
index 5bda4d368a..b47ae9013b 100644
--- a/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp
+++ b/example/ck_tile/13_moe_sorting/moe_sorting_api.hpp
@@ -18,4 +18,10 @@ struct moe_sorting_args : public ck_tile::MoeSortingHostArgs
 {
 };
 
+// use below API before call moe_sorting() to indicate if need workspace or not
+// if return non zero, means need workspace, you need to allocate a GPU buffer
+// and set to moe_sorting_args.p_ws
+// NOTE: workspace size are required to clear zero before use the API
+int moe_sorting_get_workspace_size(int tokens, int num_experts);
 float moe_sorting(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_config s);
+float moe_sorting_mp(moe_sorting_trait t, moe_sorting_args a, ck_tile::stream_config s);
diff --git a/include/ck/ck.hpp b/include/ck/ck.hpp
index 1ec0c6bc23..c8d1c20f4c 100644
--- a/include/ck/ck.hpp
+++ b/include/ck/ck.hpp
@@ -4,8 +4,9 @@
 #pragma once
 
 #include "ck/config.h"
+
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 #include "ck/utility/env.hpp"
-#ifndef CK_CODE_GEN_RTC
 #ifndef CK_DONT_USE_HIP_RUNTIME_HEADERS
 #include "hip/hip_runtime.h"
 #include "hip/hip_fp16.h"
diff --git a/include/ck/host_utility/device_prop.hpp b/include/ck/host_utility/device_prop.hpp
index 05dc491af7..e04e27b761 100644
--- a/include/ck/host_utility/device_prop.hpp
+++ b/include/ck/host_utility/device_prop.hpp
@@ -3,6 +3,7 @@
 
 #pragma once
 
+#ifndef __HIPCC_RTC__
 #include <string>
 #include <map>
 #include <hip/hip_runtime.h>
@@ -97,3 +98,4 @@ inline bool is_gfx12_supported()
 }
 
 } // namespace ck
+#endif
diff --git a/include/ck/host_utility/kernel_launch.hpp b/include/ck/host_utility/kernel_launch.hpp
index 962f89e479..5c1c1c4e60 100644
--- a/include/ck/host_utility/kernel_launch.hpp
+++ b/include/ck/host_utility/kernel_launch.hpp
@@ -2,7 +2,7 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
-
+#ifndef __HIPCC_RTC__
 #include <hip/hip_runtime.h>
 
 #include "ck/ck.hpp"
@@ -166,3 +166,4 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
     return 0;
 #endif
 }
+#endif
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp
index 526c4216fa..a2a552b521 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp
index baae1c68a2..21ff8bd3eb 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_v1.hpp
@@ -141,6 +141,7 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
 
     using Base::AMmaKStride;
     using Base::BMmaKStride;
+    using Base::MWaves;
 
     static constexpr index_t PrefetchStages  = 2;
     static constexpr index_t PrefillStages   = 1;
@@ -182,9 +183,9 @@ struct BlockwiseGemmXdlops_pipeline_bpreshuffle_v1<BlockGemmPipelineScheduler::I
 
     __device__ static constexpr auto HotLoopScheduler()
     {
-        constexpr auto num_ds_read_inst_a     = HotLoopInstList::A_LDS_Read_Inst_Num * 2;
+        constexpr auto num_ds_read_inst_a     = HotLoopInstList::A_LDS_Read_Inst_Num;
         constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num;
-        constexpr auto num_buffer_load_inst_b = HotLoopInstList::B_Buffer_Load_Inst_Num * 2;
+        constexpr auto num_buffer_load_inst_b = HotLoopInstList::B_Buffer_Load_Inst_Num * MWaves;
 
         // B global
         static_for<0, num_buffer_load_inst_b, 1>{}([&](auto i) {
diff --git a/include/ck/tensor_operation/gpu/device/device_base.hpp b/include/ck/tensor_operation/gpu/device/device_base.hpp
index 774982d905..9285211519 100644
--- a/include/ck/tensor_operation/gpu/device/device_base.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_base.hpp
@@ -3,11 +3,12 @@
 
 #pragma once
 
-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 #include <string>
 #include <sstream>
 #include <regex>
 #include <optional>
+
 #include "ck/stream_config.hpp"
 #endif
 
@@ -15,7 +16,7 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
 
-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 #define GET_OBJECT_NAME_IMLP                                                  \
     std::optional<std::string> GetObjectName() const override                 \
     {                                                                         \
@@ -77,7 +78,7 @@ struct BaseOperator
     BaseOperator()                    = default;
     BaseOperator(const BaseOperator&) = default;
     BaseOperator& operator=(const BaseOperator&) = default;
-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
     virtual bool IsSupportedArgument(const BaseArgument*) { return false; }
     virtual std::string GetTypeString() const { return ""; }
 
diff --git a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp b/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp
index 09259224e7..204b09cad4 100644
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp
@@ -2,9 +2,10 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
-
+#ifndef __HIPCC_RTC__
 #include <iostream>
 #include <vector>
+#endif
 
 #include "device_base.hpp"
 
@@ -28,6 +29,7 @@ template <typename ALayout,
           bool MaskOutUpperTriangle> // TODO: enum for mask type
 struct DeviceBatchedGemmSoftmaxGemm : public BaseOperator
 {
+#ifndef __HIPCC_RTC__
     virtual std::unique_ptr<BaseArgument>
     MakeArgumentPointer(const void* p_a,
                         const void* p_b0,
@@ -53,6 +55,7 @@ struct DeviceBatchedGemmSoftmaxGemm : public BaseOperator
                         CElementwiseOperation c_element_op) = 0;
 
     virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+#endif
 };
 
 } // namespace device
diff --git a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
index 403a1cb085..c3f91fac9e 100644
--- a/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_gemm_multiple_d.hpp
@@ -2,9 +2,11 @@
 // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
-
+#ifndef __HIPCC_RTC__
 #include <array>
+#endif
 
+#include "ck/utility/array.hpp"
 #include "ck/tensor_operation/gpu/device/device_base.hpp"
 
 namespace ck {
@@ -34,6 +36,7 @@ struct DeviceGemmMultipleD : public BaseOperator
 {
     static constexpr index_t NumDTensor = DsDataType::Size();
 
+#ifndef __HIPCC_RTC__
     virtual std::unique_ptr<BaseArgument>
     MakeArgumentPointer(const void* p_a,
                         const void* p_b,
@@ -51,6 +54,7 @@ struct DeviceGemmMultipleD : public BaseOperator
                         CDEElementwiseOperation cde_element_op) = 0;
 
     virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+#endif
 };
 
 // GEMM:
@@ -76,6 +80,7 @@ struct DeviceGemmMultipleDSplitK : public BaseOperator
 {
     static constexpr index_t NumDTensor = DsDataType::Size();
 
+#ifndef __HIPCC_RTC__
     virtual std::unique_ptr<BaseArgument>
     MakeArgumentPointer(const void* p_a,
                         const void* p_b,
@@ -94,6 +99,7 @@ struct DeviceGemmMultipleDSplitK : public BaseOperator
                         CDEElementwiseOperation cde_element_op) = 0;
 
     virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+#endif
 };
 
 // GEMM:
@@ -141,6 +147,7 @@ struct DeviceGemmMultipleDSplitKBPreShuffle : public BaseOperator
     virtual int GetPreShuffleParameters() = 0;
 };
 
+
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp b/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp
index 997dcb75a6..8824f44ec5 100644
--- a/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/gemm_specialization.hpp
@@ -28,8 +28,7 @@ enum struct GemmSpecialization
     NKOPadding,
     MNKOPadding,
 };
-
-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 inline std::string getGemmSpecializationString(const GemmSpecialization& s)
 {
     switch(s)
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
index ea5a5d0e16..b4ab96d397 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
@@ -3,8 +3,12 @@
 
 #pragma once
 
+#ifndef __HIPCC_RTC__
 #include <iostream>
 #include <sstream>
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#endif
 
 #include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
@@ -15,8 +19,6 @@
 #include "ck/tensor_operation/gpu/device/masking_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp"
-#include "ck/host_utility/device_prop.hpp"
-#include "ck/host_utility/kernel_launch.hpp"
 
 namespace ck {
 namespace tensor_operation {
@@ -429,6 +431,7 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
         matrix_padder.PadN,
         MaskOutUpperTriangle>;
 
+#ifndef __HIPCC_RTC__
     // Argument
     struct Argument : public BaseArgument
     {
@@ -603,6 +606,7 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
             return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
         }
     };
+#endif
 
     static constexpr bool IsValidCompilationParameter()
     {
@@ -610,6 +614,7 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
         return true;
     }
 
+#ifndef __HIPCC_RTC__
     static constexpr bool
     IsSupported(index_t MRaw_, index_t NRaw_, index_t KRaw_, index_t Gemm1NRaw_)
     {
@@ -837,6 +842,7 @@ struct DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle
 
         return str.str();
     }
+#endif
 
     template <class ADesc, class BDesc, class B1Desc, class CDesc>
     struct Descriptor
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp
index e6466a487b..3fae3a3765 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_gemm_multiple_d_xdl_cshuffle.hpp
@@ -3,8 +3,12 @@
 
 #pragma once
 
+#ifndef __HIPCC_RTC__
 #include <iostream>
 #include <sstream>
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#endif
 
 #include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
@@ -14,8 +18,6 @@
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp"
-#include "ck/host_utility/device_prop.hpp"
-#include "ck/host_utility/kernel_launch.hpp"
 
 namespace ck {
 
@@ -224,9 +226,9 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
         return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw);
     }
 
-    static auto MakeDsGridDescriptor_M_N(const std::array<index_t, NumDTensor>& MRaws,
-                                         const std::array<index_t, NumDTensor>& NRaws,
-                                         const std::array<index_t, NumDTensor>& DsStride)
+    static auto MakeDsGridDescriptor_M_N(const Array<index_t, NumDTensor>& MRaws,
+                                         const Array<index_t, NumDTensor>& NRaws,
+                                         const Array<index_t, NumDTensor>& DsStride)
     {
         return generate_tuple(
             [&](auto i) {
@@ -308,6 +310,7 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
     using Block2ETileMap =
         remove_cvref_t<decltype(GridwiseGemm::MakeDefaultBlock2ETileMap(EGridDesc_M_N{}))>;
 
+#ifndef __HIPCC_RTC__
     // Argument
     struct Argument : public BaseArgument
     {
@@ -497,6 +500,8 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
         }
     };
 
+#endif
+
     static constexpr bool IsSupported(index_t MRaw_, index_t NRaw_, index_t KRaw_)
     {
         // check vector load/store
@@ -577,6 +582,7 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
         return true;
     }
 
+#ifndef __HIPCC_RTC__
     static bool IsSupportedArgument(const Argument& arg)
     {
         if(!ck::is_xdl_supported())
@@ -675,11 +681,13 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
     {
         auto str = std::stringstream();
 
-        std::map<LoopScheduler, std::string> LoopSchedToString{
-            {LoopScheduler::Default, "Default"}, {LoopScheduler::Interwave, "Interwave"}};
+        std::map<LoopScheduler, std::string> LoopSchedToString{{LoopScheduler::Default, "Default"},
+                                                               { LoopScheduler::Interwave,
+                                                                 "Interwave" }};
 
         std::map<PipelineVersion, std::string> PipelineVersionToString{{PipelineVersion::v1, "v1"},
-                                                                       {PipelineVersion::v2, "v2"}};
+                                                                       { PipelineVersion::v2,
+                                                                         "v2" }};
 
         // clang-format off
         str << "DeviceGemmMultipleD_Xdl_CShuffle"
@@ -708,6 +716,7 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
 
         return str.str();
     }
+#endif
 
     template <class ADesc, class BDesc, class DsDesc, class EDesc>
     struct Descriptor
@@ -846,7 +855,9 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
                                EDataType* __restrict__ p_e_grid)
     {
         __shared__ char p_shared_block[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+#ifndef __HIPCC_RTC__
         assert(desc.IsValid());
+#endif
         if(desc.has_main_k_block_loop)
         {
             GridwiseGemm::template Run<true>(p_a_grid,
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
index ef87bb52ae..abd6a080aa 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
@@ -13,8 +13,10 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp"
 #include "ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm.hpp"
+#include "ck/tensor_operation/operator_transform/transform_conv_ngchw_to_nhwgc.hpp"
 #include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_bwd_weight.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
 #include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
@@ -138,8 +140,10 @@ template <ck::index_t NDimSpatial,
           index_t CShuffleNXdlPerWavePerShuffle,
           typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
           index_t CBlockTransferScalarPerVector_NWaveNPerXdl,
-          typename ComputeTypeA = InDataType,
-          typename ComputeTypeB = ComputeTypeA>
+          typename ComputeTypeA                          = InDataType,
+          typename ComputeTypeB                          = ComputeTypeA,
+          index_t MaxTransposeTransferSrcScalarPerVector = 1,
+          index_t MaxTransposeTransferDstScalarPerVector = 1>
 struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
     : public DeviceGroupedConvBwdWeight<NDimSpatial,
                                         InLayout,
@@ -160,6 +164,11 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
     using BDataType = InDataType;
     using CDataType = WeiDataType;
 
+    // If NGCHW then ADataType must be equal to BDataType
+    static_assert(!(is_NGCHW_GKYXC_NGKHW<InLayout, WeiLayout, OutLayout>() ||
+                    is_NGCDHW_GKZYXC_NGKDHW<InLayout, WeiLayout, OutLayout>()) ||
+                  is_same_v<ADataType, BDataType>);
+
     using AElementwiseOperation = OutElementwiseOperation;
     using BElementwiseOperation = InElementwiseOperation;
     using CElementwiseOperation = WeiElementwiseOperation;
@@ -279,6 +288,51 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
     using BGridDesc_K0_N_K1 = remove_cvref_t<decltype(ABCGridDescs{}[I1])>;
     using CGridDesc_M_N     = remove_cvref_t<decltype(ABCGridDescs{}[I2])>;
 
+    static constexpr index_t ClusterLengthMPerBlock =
+        CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock::At(1);
+    static constexpr index_t ClusterLengthNPerBlock =
+        CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock::At(3);
+
+    static constexpr auto conv_ngchw_to_nhwgc_transformer =
+        TransformConvNGCHWToNHWGC<InLayout,
+                                  WeiLayout,
+                                  OutLayout,
+                                  NDimSpatial,
+                                  MPerBlock / ClusterLengthMPerBlock,
+                                  NPerBlock / ClusterLengthNPerBlock>{};
+
+    using Block2TileMapElementwise = BlockToCTileMap_M00_N0_M01Adapt<MPerBlock, NPerBlock>;
+
+    static constexpr index_t TransposeTransferSrcScalarPerVectorAligned =
+        std::min(NPerBlock / ClusterLengthNPerBlock, MaxTransposeTransferSrcScalarPerVector);
+    static constexpr index_t TransposeTransferDstScalarPerVectorAligned =
+        std::min(MPerBlock / ClusterLengthMPerBlock, MaxTransposeTransferDstScalarPerVector);
+
+    using NGCHWTransposeDescType =
+        remove_cvref_t<decltype(conv_ngchw_to_nhwgc_transformer
+                                    .template MakeNGCHWTransposeDesc<NDimSpatial>({}, {}))>;
+    using NHWGCTransposeDescType =
+        remove_cvref_t<decltype(conv_ngchw_to_nhwgc_transformer
+                                    .template MakeNHWGCTransposeDesc<NDimSpatial>({}, {}))>;
+
+    using GridwiseElementwiseTranspose =
+        GridwiseElementwise<Tuple<NGCHWTransposeDescType>,
+                            Tuple<NHWGCTransposeDescType>,
+                            Tuple<const ADataType*>,
+                            Tuple<ADataType*>,
+                            Block2TileMapElementwise,
+                            element_wise::PassThrough,
+                            BlockSize,
+                            MPerBlock,
+                            NPerBlock,
+                            MPerBlock / ClusterLengthMPerBlock,
+                            NPerBlock / ClusterLengthNPerBlock,
+                            Sequence<1, 0>,
+                            Sequence<TransposeTransferSrcScalarPerVectorAligned>,
+                            Sequence<TransposeTransferDstScalarPerVectorAligned>,
+                            I1,
+                            I0>;
+
     using GridwiseGemm = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_bwd_weight<
         BlockSize,
         ADataType,
@@ -398,6 +452,13 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                       end(a_g_n_k_wos_lengths),
                       begin(output_spatial_lengths_));
 
+            std::array<index_t, NDimSpatial + 3> b_g_n_c_wis_strides_transposed =
+                conv_ngchw_to_nhwgc_transformer.TransposeStrides(b_g_n_c_wis_lengths,
+                                                                 b_g_n_c_wis_strides);
+            std::array<index_t, NDimSpatial + 3> a_g_n_k_wos_strides_transposed =
+                conv_ngchw_to_nhwgc_transformer.TransposeStrides(a_g_n_k_wos_lengths,
+                                                                 a_g_n_k_wos_strides);
+
             const auto descs =
                 conv_to_gemm_transformer
                     .template MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<NDimSpatial>(
@@ -407,9 +468,9 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                         input_spatial_lengths_,
                         filter_spatial_lengths_,
                         output_spatial_lengths_,
-                        b_g_n_c_wis_strides,
+                        b_g_n_c_wis_strides_transposed,
                         e_g_k_c_xs_strides,
-                        a_g_n_k_wos_strides,
+                        a_g_n_k_wos_strides_transposed,
                         conv_filter_strides,
                         conv_filter_dilations,
                         input_left_pads,
@@ -424,8 +485,8 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                 GridwiseGemm::MakeCBlockClusterAdaptor(c_grid_desc_m_n_, M01, N01, k_batch_);
 
             // A/B/C Batch Stride
-            compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides[0];
-            compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_n_c_wis_strides[0];
+            compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides_transposed[0];
+            compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_n_c_wis_strides_transposed[0];
             compute_ptr_offset_of_batch_.BatchStrideC_ =
                 Conv_K_ * Conv_C_ *
                 std::accumulate(begin(filter_spatial_lengths_),
@@ -441,6 +502,54 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                 c_grid_desc_mblock_mperblock_nblock_nperblock_ =
                     GridwiseGemm::MakeCGridDesc_MBlock_MPerBlock_NBlock_NPerBlock(c_grid_desc_m_n_);
             }
+
+            if constexpr(is_NGCHW_GKYXC_NGKHW<InLayout, WeiLayout, OutLayout>() ||
+                         is_NGCDHW_GKZYXC_NGKDHW<InLayout, WeiLayout, OutLayout>())
+            {
+                a_in_transpose_desc_ =
+                    conv_ngchw_to_nhwgc_transformer.template MakeNGCHWTransposeDesc<NDimSpatial>(
+                        a_g_n_k_wos_lengths, a_g_n_k_wos_strides);
+                a_out_transpose_desc_ =
+                    conv_ngchw_to_nhwgc_transformer.template MakeNHWGCTransposeDesc<NDimSpatial>(
+                        a_g_n_k_wos_lengths, a_g_n_k_wos_strides);
+
+                b_in_transpose_desc_ =
+                    conv_ngchw_to_nhwgc_transformer.template MakeNGCHWTransposeDesc<NDimSpatial>(
+                        b_g_n_c_wis_lengths, b_g_n_c_wis_strides);
+                b_out_transpose_desc_ =
+                    conv_ngchw_to_nhwgc_transformer.template MakeNHWGCTransposeDesc<NDimSpatial>(
+                        b_g_n_c_wis_lengths, b_g_n_c_wis_strides);
+
+                elementwise_block_2_ctile_map_transpose_a_ = Block2TileMapElementwise{
+                    a_in_transpose_desc_.GetLength(I0), a_in_transpose_desc_.GetLength(I1)};
+
+                elementwise_block_2_ctile_map_transpose_b_ = Block2TileMapElementwise{
+                    b_in_transpose_desc_.GetLength(I0), b_in_transpose_desc_.GetLength(I1)};
+            }
+        }
+
+        std::size_t GetWorkspaceATensorSizeBytes() const
+        {
+            return sizeof(ADataType) * a_in_transpose_desc_.GetElementSpaceSize();
+        }
+
+        std::size_t GetWorkspaceBTensorSizeBytes() const
+        {
+            return sizeof(BDataType) * b_in_transpose_desc_.GetElementSpaceSize();
+        }
+
+        std::size_t GetWorkspaceSizeBytes() const
+        {
+            // Transpose require workspace for A and B
+            if constexpr(is_NGCHW_GKYXC_NGKHW<InLayout, WeiLayout, OutLayout>() ||
+                         is_NGCDHW_GKZYXC_NGKDHW<InLayout, WeiLayout, OutLayout>())
+            {
+                return GetWorkspaceATensorSizeBytes() + GetWorkspaceBTensorSizeBytes();
+            }
+            else
+            {
+                return 0;
+            }
         }
 
         const ADataType* p_a_grid_;
@@ -453,6 +562,12 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
 
         Block2CTileMap block_2_ctile_map_;
 
+        Block2TileMapElementwise elementwise_block_2_ctile_map_transpose_a_,
+            elementwise_block_2_ctile_map_transpose_b_;
+
+        NGCHWTransposeDescType a_in_transpose_desc_, b_in_transpose_desc_;
+        NHWGCTransposeDescType a_out_transpose_desc_, b_out_transpose_desc_;
+
         // for computing batch offset
         ComputePtrOffsetOfStridedBatch<> compute_ptr_offset_of_batch_;
 
@@ -502,13 +617,57 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
 
         float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
         {
-            if(!GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
-                                            arg.b_grid_desc_kbatch_k0_n_k1_,
-                                            arg.c_grid_desc_m_n_,
-                                            arg.block_2_ctile_map_))
+            float avg_time = 0.f;
+
+            const ADataType* p_a_grid = arg.p_a_grid_;
+            const BDataType* p_b_grid = arg.p_b_grid_;
+
+            if constexpr(is_NGCHW_GKYXC_NGKHW<InLayout, WeiLayout, OutLayout>() ||
+                         is_NGCDHW_GKZYXC_NGKDHW<InLayout, WeiLayout, OutLayout>())
             {
-                throw std::runtime_error(
-                    "wrong! GridwiseGemm_km_kn_m0m1n0n1_xdlops_v3r1 has invalid setting");
+                const index_t grid_size_a =
+                    arg.elementwise_block_2_ctile_map_transpose_a_.CalculateGridSize(
+                        arg.a_in_transpose_desc_);
+                const index_t grid_size_b =
+                    arg.elementwise_block_2_ctile_map_transpose_b_.CalculateGridSize(
+                        arg.b_in_transpose_desc_);
+
+                p_a_grid = type_convert<const ADataType*>(arg.p_workspace_);
+                p_b_grid = type_convert<const BDataType*>(arg.p_workspace_) +
+                           arg.GetWorkspaceATensorSizeBytes() / sizeof(BDataType);
+                ADataType* p_out_a_grid = type_convert<ADataType*>(arg.p_workspace_);
+                BDataType* p_out_b_grid = type_convert<BDataType*>(arg.p_workspace_) +
+                                          arg.GetWorkspaceATensorSizeBytes() / sizeof(BDataType);
+
+                // Different data type for A and B is not supported
+                auto kernel_transpose = kernel_elementwise_dual<GridwiseElementwiseTranspose,
+                                                                ck::Tuple<NGCHWTransposeDescType>,
+                                                                ck::Tuple<NGCHWTransposeDescType>,
+                                                                ck::Tuple<NHWGCTransposeDescType>,
+                                                                ck::Tuple<NHWGCTransposeDescType>,
+                                                                ck::Tuple<const ADataType*>,
+                                                                ck::Tuple<ADataType*>,
+                                                                Block2TileMapElementwise,
+                                                                Block2TileMapElementwise,
+                                                                element_wise::PassThrough>;
+
+                avg_time += launch_and_time_kernel(stream_config,
+                                                   kernel_transpose,
+                                                   dim3(grid_size_a + grid_size_b),
+                                                   dim3(BlockSize),
+                                                   0,
+                                                   make_tuple(arg.a_in_transpose_desc_),
+                                                   make_tuple(arg.b_in_transpose_desc_),
+                                                   make_tuple(arg.a_out_transpose_desc_),
+                                                   make_tuple(arg.b_out_transpose_desc_),
+                                                   make_tuple(arg.p_a_grid_),
+                                                   make_tuple(arg.p_b_grid_),
+                                                   make_tuple(p_out_a_grid),
+                                                   make_tuple(p_out_b_grid),
+                                                   arg.elementwise_block_2_ctile_map_transpose_a_,
+                                                   arg.elementwise_block_2_ctile_map_transpose_b_,
+                                                   element_wise::PassThrough{},
+                                                   grid_size_a);
             }
 
             const index_t grid_size =
@@ -536,33 +695,35 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                     ComputePtrOffsetOfStridedBatch<>,
                     has_main_loop>;
 
-                return launch_and_time_kernel(stream_config,
-                                              kernel,
-                                              dim3(grid_size),
-                                              dim3(BlockSize),
-                                              0,
-                                              arg.p_a_grid_,
-                                              arg.p_b_grid_,
-                                              arg.p_c_grid_,
-                                              arg.a_element_op_,
-                                              arg.b_element_op_,
-                                              arg.c_element_op_,
-                                              arg.Conv_G_,
-                                              arg.a_grid_desc_kbatch_k0_m_k1_,
-                                              arg.b_grid_desc_kbatch_k0_n_k1_,
-                                              arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
-                                              arg.block_2_ctile_map_,
-                                              arg.compute_ptr_offset_of_batch_);
+                avg_time +=
+                    launch_and_time_kernel(stream_config,
+                                           kernel,
+                                           dim3(grid_size),
+                                           dim3(BlockSize),
+                                           0,
+                                           p_a_grid,
+                                           p_b_grid,
+                                           arg.p_c_grid_,
+                                           arg.a_element_op_,
+                                           arg.b_element_op_,
+                                           arg.c_element_op_,
+                                           arg.Conv_G_,
+                                           arg.a_grid_desc_kbatch_k0_m_k1_,
+                                           arg.b_grid_desc_kbatch_k0_n_k1_,
+                                           arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                           arg.block_2_ctile_map_,
+                                           arg.compute_ptr_offset_of_batch_);
             };
 
             if(has_main_k0_block_loop)
             {
-                return launch_kernel(integral_constant<bool, true>{});
+                launch_kernel(integral_constant<bool, true>{});
             }
             else
             {
-                return launch_kernel(integral_constant<bool, false>{});
+                launch_kernel(integral_constant<bool, false>{});
             }
+            return avg_time;
         }
 
         float Run(const BaseArgument* p_arg,
@@ -598,7 +759,8 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
         else if constexpr(NDimSpatial == 2)
         {
             if constexpr(!(is_NHWGC_GKYXC_NHWGK<InLayout, WeiLayout, OutLayout>() ||
-                           is_GNHWC_GKYXC_GNHWK<InLayout, WeiLayout, OutLayout>()))
+                           is_GNHWC_GKYXC_GNHWK<InLayout, WeiLayout, OutLayout>() ||
+                           is_NGCHW_GKYXC_NGKHW<InLayout, WeiLayout, OutLayout>()))
             {
                 return false;
             }
@@ -606,7 +768,8 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
         else if constexpr(NDimSpatial == 3)
         {
             if constexpr(!(is_NDHWGC_GKZYXC_NDHWGK<InLayout, WeiLayout, OutLayout>() ||
-                           is_GNDHWC_GKZYXC_GNDHWK<InLayout, WeiLayout, OutLayout>()))
+                           is_GNDHWC_GKZYXC_GNDHWK<InLayout, WeiLayout, OutLayout>() ||
+                           is_NGCDHW_GKZYXC_NGKDHW<InLayout, WeiLayout, OutLayout>()))
             {
                 return false;
             }
@@ -644,6 +807,35 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
             return false;
         }
 
+        if constexpr(is_NGCHW_GKYXC_NGKHW<InLayout, WeiLayout, OutLayout>() ||
+                     is_NGCDHW_GKZYXC_NGKDHW<InLayout, WeiLayout, OutLayout>())
+        {
+            if((arg.Conv_G_ * arg.Conv_C_) % TransposeTransferDstScalarPerVectorAligned != 0)
+            {
+                return false;
+            }
+
+            if((arg.Conv_G_ * arg.Conv_K_) % TransposeTransferDstScalarPerVectorAligned != 0)
+            {
+                return false;
+            }
+
+            const index_t input_spatial_acum = ck::accumulate_n<index_t>(
+                arg.input_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
+            const index_t output_spatial_acum = ck::accumulate_n<index_t>(
+                arg.output_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>());
+
+            if(input_spatial_acum % TransposeTransferSrcScalarPerVectorAligned != 0)
+            {
+                return false;
+            }
+
+            if(output_spatial_acum % TransposeTransferSrcScalarPerVectorAligned != 0)
+            {
+                return false;
+            }
+        }
+
         // Gridwise GEMM size
         return GridwiseGemm::CheckValidity(arg.a_grid_desc_kbatch_k0_m_k1_,
                                            arg.b_grid_desc_kbatch_k0_n_k1_,
@@ -764,12 +956,49 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
             << BBlockTransferDstScalarPerVector_K1 << ", "
             << CShuffleMXdlPerWavePerShuffle << ", "
             << CShuffleNXdlPerWavePerShuffle << ", "
-            << CBlockTransferScalarPerVector_NWaveNPerXdl
-            << ">";
+            << CBlockTransferScalarPerVector_NWaveNPerXdl;
+
+        if constexpr(is_NGCHW_GKYXC_NGKHW<InLayout, WeiLayout, OutLayout>() || 
+                        is_NGCDHW_GKZYXC_NGKDHW<InLayout, WeiLayout, OutLayout>()) {
+                str << ", TransposeTransferSrcScalarPerVectorAligned: "
+                << TransposeTransferSrcScalarPerVectorAligned <<", "
+                << "TransposeTransferDstScalarPerVectorAligned: " << TransposeTransferDstScalarPerVectorAligned;
+            }
+
+            
+            str << ">";
         // clang-format on
 
         return str.str();
     }
+
+    size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override
+    {
+        auto arg = dynamic_cast<const Argument*>(p_arg);
+        if(arg)
+        {
+            return arg->GetWorkspaceSizeBytes();
+        }
+        else
+            throw std::runtime_error(
+                "The argument pointer is not an object of "
+                "DeviceGroupedConvBwdWeight_Xdl_CShuffle::Argument structure!");
+    }
+
+    void SetWorkSpacePointer(BaseArgument* p_arg,
+                             void* p_workspace,
+                             const StreamConfig& = StreamConfig{}) const override
+    {
+        auto p_arg_ = dynamic_cast<Argument*>(p_arg);
+        if(p_arg_)
+        {
+            p_arg_->p_workspace_ = p_workspace;
+        }
+        else
+            throw std::runtime_error(
+                "The argument pointer is not an object of "
+                "DeviceGroupedConvBwdWeight_Xdl_CShuffle::Argument structure!");
+    }
 };
 
 } // namespace device
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp
new file mode 100644
index 0000000000..f6ea23a1e7
--- /dev/null
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp
@@ -0,0 +1,1349 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <numeric>
+#include <sstream>
+
+#include "ck/utility/common_header.hpp"
+
+#include "ck/tensor_description/tensor_descriptor.hpp"
+#include "ck/tensor_description/tensor_descriptor_helper.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_grouped_conv_bwd_weight.hpp"
+#include "ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm.hpp"
+#include "ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm_v2.hpp"
+#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_bwd_weight_v3.hpp"
+#include <ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp>
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
+
+#include "ck/host_utility/device_prop.hpp"
+#include "ck/host_utility/kernel_launch.hpp"
+#include "ck/host_utility/flush_cache.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename GridwiseGemm,
+          typename AGridDesc_AK0_M_K1,
+          typename BGridDesc_BK0_N_K1,
+          typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+          typename ComputePtrOffsetOfBatch,
+          bool HasMainKBlockLoop,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+          index_t MinimumOccupancy = 1,
+          TailNumber TailNum       = TailNumber::Full>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
+#endif
+        kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3(
+            typename GridwiseGemm::Argument karg,
+            const AGridDesc_AK0_M_K1 a_grid_desc_ak0_m_ak1,
+            const BGridDesc_BK0_N_K1 b_grid_desc_bk0_n_bk1,
+            const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
+                c_grid_desc_mblock_mperblock_nblock_nperblock,
+            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
+            const index_t num_k_per_block)
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
+    const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z);
+    const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block);
+
+    const long_index_t a_batch_offset =
+        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
+    const long_index_t b_batch_offset =
+        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
+    const long_index_t e_batch_offset =
+        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+
+    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+    GridwiseGemm::template Run<AGridDesc_AK0_M_K1,
+                               BGridDesc_BK0_N_K1,
+                               CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                               HasMainKBlockLoop,
+                               CGlobalMemoryDataOperation,
+                               TailNum>(karg.p_a_grid + a_batch_offset,
+                                        karg.p_b_grid + b_batch_offset,
+                                        karg.p_c_grid + e_batch_offset,
+                                        p_shared,
+                                        karg,
+                                        a_grid_desc_ak0_m_ak1,
+                                        b_grid_desc_bk0_n_bk1,
+                                        c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                        k_idx);
+#else
+    ignore = karg;
+#endif // end of if (defined(__gfx9__)
+}
+
+template <typename GridwiseGemm,
+          typename AGridDesc_AK0_M_K1,
+          typename BGridDesc_BK0_N_K1,
+          typename CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+          typename ComputePtrOffsetOfBatch,
+          bool HasMainKBlockLoop,
+          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
+          index_t MinimumOccupancy = 1,
+          TailNumber TailNum       = TailNumber::Full>
+__global__ void
+#if CK_USE_LAUNCH_BOUNDS
+    __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, MinimumOccupancy)
+#endif
+        kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3_2lds(
+            typename GridwiseGemm::Argument karg,
+            const AGridDesc_AK0_M_K1 a_grid_desc_ak0_m_ak1,
+            const BGridDesc_BK0_N_K1 b_grid_desc_bk0_n_bk1,
+            const CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock
+                c_grid_desc_mblock_mperblock_nblock_nperblock,
+            const ComputePtrOffsetOfBatch compute_ptr_offset_of_batch,
+            const index_t num_k_per_block)
+{
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx9__))
+    // offset base pointer for each work-group
+    const index_t g_idx = __builtin_amdgcn_readfirstlane(blockIdx.z);
+    const index_t k_idx = __builtin_amdgcn_readfirstlane(blockIdx.y * num_k_per_block);
+
+    const long_index_t a_batch_offset =
+        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx));
+    const long_index_t b_batch_offset =
+        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetBPtrOffset(g_idx));
+    const long_index_t e_batch_offset =
+        amd_wave_read_first_lane(compute_ptr_offset_of_batch.GetEPtrOffset(g_idx));
+
+    // Pass two lds pointer is the key to tell compiler that ds_read/write
+    // operate on different lds chunk at same time without order dependecy
+    __shared__ char p_shared_0[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+    __shared__ char p_shared_1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
+
+    GridwiseGemm::template Run_2Lds<AGridDesc_AK0_M_K1,
+                                    BGridDesc_BK0_N_K1,
+                                    CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock,
+                                    HasMainKBlockLoop,
+                                    CGlobalMemoryDataOperation,
+                                    TailNum>(karg.p_a_grid + a_batch_offset,
+                                             karg.p_b_grid + b_batch_offset,
+                                             karg.p_c_grid + e_batch_offset,
+                                             p_shared_0,
+                                             p_shared_1,
+                                             karg,
+                                             a_grid_desc_ak0_m_ak1,
+                                             b_grid_desc_bk0_n_bk1,
+                                             c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                             k_idx);
+#else
+    ignore = karg;
+#endif // end of if (defined(__gfx9__)
+}
+
+// out[N, Ho, Wo, K] = in[N, Hi, Wi, C] * wei[K, Y, X, C]
+template <ck::index_t NDimSpatial,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename AccDataType,
+          typename InElementwiseOperation,
+          typename WeiElementwiseOperation,
+          typename OutElementwiseOperation,
+          ConvolutionBackwardWeightSpecialization ConvBackwardWeightSpecialization,
+          ck::index_t BlockSize,
+          ck::index_t MPerBlock,
+          ck::index_t NPerBlock,
+          ck::index_t K0PerBlock,
+          ck::index_t K1,
+          ck::index_t MPerXdl,
+          ck::index_t NPerXdl,
+          ck::index_t MXdlPerWave,
+          ck::index_t NXdlPerWave,
+          typename ABlockTransferThreadClusterLengths_K0_M_K1,
+          typename ABlockTransferThreadClusterArrangeOrder,
+          typename ABlockTransferSrcAccessOrder,
+          ck::index_t ABlockTransferSrcVectorDim,
+          ck::index_t ABlockTransferSrcScalarPerVector,
+          ck::index_t ABlockTransferDstScalarPerVector_K1,
+          bool ABlockLdsAddExtraM,
+          typename BBlockTransferThreadClusterLengths_K0_N_K1,
+          typename BBlockTransferThreadClusterArrangeOrder,
+          typename BBlockTransferSrcAccessOrder,
+          ck::index_t BBlockTransferSrcVectorDim,
+          ck::index_t BBlockTransferSrcScalarPerVector,
+          ck::index_t BBlockTransferDstScalarPerVector_K1,
+          bool BBlockLdsAddExtraN,
+          index_t CShuffleMXdlPerWavePerShuffle,
+          index_t CShuffleNXdlPerWavePerShuffle,
+          typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+          index_t CBlockTransferScalarPerVector_NWaveNPerXdl,
+          BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave,
+          BlockGemmPipelineVersion BlkGemmPipelineVer = BlockGemmPipelineVersion::v1,
+          typename ComputeTypeA                       = InDataType,
+          typename ComputeTypeB                       = ComputeTypeA>
+struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3
+    : public DeviceGroupedConvBwdWeight<NDimSpatial,
+                                        InLayout,
+                                        WeiLayout,
+                                        OutLayout,
+                                        InDataType,
+                                        WeiDataType,
+                                        OutDataType,
+                                        InElementwiseOperation,
+                                        WeiElementwiseOperation,
+                                        OutElementwiseOperation,
+                                        ComputeTypeA,
+                                        ComputeTypeB>
+{
+    static_assert(is_same_v<InElementwiseOperation, element_wise::PassThrough>);
+    static_assert(is_same_v<WeiElementwiseOperation, element_wise::PassThrough>);
+    static_assert(is_same_v<OutElementwiseOperation, element_wise::PassThrough>);
+
+    using DeviceOp = DeviceGroupedConvBwdWeight_Xdl_CShuffleV3;
+
+    using ADataType = OutDataType;
+    using BDataType = InDataType;
+    using CDataType = WeiDataType;
+
+    using AElementwiseOperation = OutElementwiseOperation;
+    using BElementwiseOperation = InElementwiseOperation;
+    using CElementwiseOperation = WeiElementwiseOperation;
+
+    // TODO make A/B datatype different
+    using ABDataType = InDataType;
+
+    static inline auto I0 = Number<0>{};
+    static inline auto I1 = Number<1>{};
+    static inline auto I2 = Number<2>{};
+    static inline auto I3 = Number<3>{};
+    static inline auto I4 = Number<4>{};
+    static inline auto I5 = Number<5>{};
+
+    static constexpr GemmSpecialization GemmSpec = GemmSpecialization::Default;
+    static constexpr auto K1Number               = Number<K1>{};
+
+    static constexpr auto conv_to_gemm_transformer =
+        TransformConvBwdWeightToGemmV2<NDimSpatial,
+                                       MPerBlock,
+                                       NPerBlock,
+                                       K1Number,
+                                       K0PerBlock / K1Number,
+                                       1 /*NumGroupsToMerge*/,
+                                       ConvBackwardWeightSpecialization>{};
+
+    template <ck::index_t NDim, typename ck::enable_if<NDim == 1, bool>::type = false>
+    static auto GetABCGridDesc()
+    {
+        const ck::index_t dim   = 1;
+        const ck::index_t batch = 1;
+        const std::array<ck::index_t, NDimSpatial> lengths{1};
+        const std::array<ck::index_t, NDimSpatial + 3> strides{1, 1, 1, 1};
+        const std::array<ck::index_t, NDimSpatial> params{1};
+        return conv_to_gemm_transformer.template MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<1>(
+            dim,
+            dim,
+            dim,
+            lengths,
+            lengths,
+            lengths,
+            strides,
+            strides,
+            strides,
+            params,
+            params,
+            params,
+            params,
+            batch);
+    }
+
+    template <ck::index_t NDim, typename ck::enable_if<NDim == 2, bool>::type = false>
+    static auto GetABCGridDesc()
+    {
+        const ck::index_t dim   = 1;
+        const ck::index_t batch = 1;
+        const std::array<ck::index_t, NDimSpatial> lengths{1, 1};
+        const std::array<ck::index_t, NDimSpatial + 3> strides{1, 1, 1, 1, 1};
+        const std::array<ck::index_t, NDimSpatial> params{1, 1};
+        return conv_to_gemm_transformer.template MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<2>(
+            dim,
+            dim,
+            dim,
+            lengths,
+            lengths,
+            lengths,
+            strides,
+            strides,
+            strides,
+            params,
+            params,
+            params,
+            params,
+            batch);
+    }
+
+    template <ck::index_t NDim, typename ck::enable_if<NDim == 3, bool>::type = false>
+    static auto GetABCGridDesc()
+    {
+        const ck::index_t dim   = 1;
+        const ck::index_t batch = 1;
+        const std::array<ck::index_t, NDimSpatial> lengths{1, 1, 1};
+        const std::array<ck::index_t, NDimSpatial + 3> strides{1, 1, 1, 1, 1, 1};
+        const std::array<ck::index_t, NDimSpatial> params{1, 1, 1};
+        return conv_to_gemm_transformer.template MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<3>(
+            dim,
+            dim,
+            dim,
+            lengths,
+            lengths,
+            lengths,
+            strides,
+            strides,
+            strides,
+            params,
+            params,
+            params,
+            params,
+            batch);
+    }
+
+    using ABCGridDescs = decltype(GetABCGridDesc<NDimSpatial>());
+
+    using AGridDesc_K0_M_K1 = remove_cvref_t<decltype(ABCGridDescs{}[I0])>;
+    using BGridDesc_K0_N_K1 = remove_cvref_t<decltype(ABCGridDescs{}[I1])>;
+    using CGridDesc_M_N     = remove_cvref_t<decltype(ABCGridDescs{}[I2])>;
+
+    using GridwiseGemm =
+        GridwiseGemm_xdl_cshuffle_v3<tensor_layout::gemm::RowMajor,
+                                     tensor_layout::gemm::ColumnMajor,
+                                     tensor_layout::gemm::RowMajor,
+                                     ADataType,
+                                     BDataType,
+                                     AccDataType,
+                                     CDataType,
+                                     CDataType,
+                                     AElementwiseOperation,
+                                     BElementwiseOperation,
+                                     CElementwiseOperation,
+                                     GemmSpec,
+                                     BlockSize,
+                                     MPerBlock,
+                                     NPerBlock,
+                                     K0PerBlock,
+                                     K1,
+                                     K1,
+                                     MPerXdl,
+                                     NPerXdl,
+                                     MXdlPerWave,
+                                     NXdlPerWave,
+                                     ABlockTransferThreadClusterLengths_K0_M_K1,
+                                     ABlockTransferThreadClusterArrangeOrder,
+                                     ABlockTransferSrcAccessOrder,
+                                     ABlockTransferSrcVectorDim,
+                                     ABlockTransferSrcScalarPerVector,
+                                     ABlockTransferDstScalarPerVector_K1,
+                                     false,
+                                     ABlockLdsAddExtraM,
+                                     BBlockTransferThreadClusterLengths_K0_N_K1,
+                                     BBlockTransferThreadClusterArrangeOrder,
+                                     BBlockTransferSrcAccessOrder,
+                                     BBlockTransferSrcVectorDim,
+                                     BBlockTransferSrcScalarPerVector,
+                                     BBlockTransferDstScalarPerVector_K1,
+                                     false,
+                                     BBlockLdsAddExtraN,
+                                     CShuffleMXdlPerWavePerShuffle,
+                                     CShuffleNXdlPerWavePerShuffle,
+                                     CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
+                                     CBlockTransferScalarPerVector_NWaveNPerXdl,
+                                     BlkGemmPipeSched,
+                                     BlkGemmPipelineVer,
+                                     ComputeTypeA,
+                                     ComputeTypeB>;
+
+    // Argument
+    using CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock =
+        decltype(GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+            CGridDesc_M_N{}, 1, 1));
+
+    struct Argument : public BaseArgument
+    {
+        Argument(const InDataType* p_in_grid,
+                 WeiDataType* p_wei_grid,
+                 const OutDataType* p_out_grid,
+                 const std::array<index_t, NDimSpatial + 3>& b_g_n_c_wis_lengths, // input
+                 const std::array<index_t, NDimSpatial + 3>& b_g_n_c_wis_strides,
+                 const std::array<index_t, NDimSpatial + 3>& e_g_k_c_xs_lengths, // weight
+                 const std::array<index_t, NDimSpatial + 3>& e_g_k_c_xs_strides,
+                 const std::array<index_t, NDimSpatial + 3>& a_g_n_k_wos_lengths, // output
+                 const std::array<index_t, NDimSpatial + 3>& a_g_n_k_wos_strides,
+                 const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
+                 const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
+                 const std::array<ck::index_t, NDimSpatial>& input_left_pads,
+                 const std::array<ck::index_t, NDimSpatial>& input_right_pads,
+                 const ck::index_t M01,
+                 const ck::index_t N01,
+                 InElementwiseOperation in_element_op,
+                 WeiElementwiseOperation wei_element_op,
+                 OutElementwiseOperation out_element_op,
+                 ck::index_t split_k)
+            : p_a_grid_{p_out_grid},
+              p_b_grid_{p_in_grid},
+              p_c_grid_{p_wei_grid},
+              a_grid_desc_kbatch_k0_m_k1_{},
+              b_grid_desc_kbatch_k0_n_k1_{},
+              c_grid_desc_m_n_{},
+              c_grid_desc_mblock_mperblock_nblock_nperblock_{},
+              compute_ptr_offset_of_batch_{},
+              M01_{M01},
+              N01_{N01},
+              a_element_op_{out_element_op},
+              b_element_op_{in_element_op},
+              c_element_op_{wei_element_op},
+              Conv_G_{b_g_n_c_wis_lengths[0]},
+              Conv_N_{b_g_n_c_wis_lengths[1]},
+              Conv_K_{e_g_k_c_xs_lengths[1]},
+              Conv_C_{b_g_n_c_wis_lengths[2]},
+              input_spatial_lengths_{},
+              filter_spatial_lengths_{},
+              output_spatial_lengths_{},
+              conv_filter_strides_{conv_filter_strides},
+              input_left_pads_{input_left_pads},
+              input_right_pads_{input_right_pads},
+              k_batch_{split_k}
+        {
+            constexpr index_t spatial_offset = 3;
+            std::copy(begin(b_g_n_c_wis_lengths) + spatial_offset,
+                      end(b_g_n_c_wis_lengths),
+                      begin(input_spatial_lengths_));
+            std::copy(begin(e_g_k_c_xs_lengths) + spatial_offset,
+                      end(e_g_k_c_xs_lengths),
+                      begin(filter_spatial_lengths_));
+            std::copy(begin(a_g_n_k_wos_lengths) + spatial_offset,
+                      end(a_g_n_k_wos_lengths),
+                      begin(output_spatial_lengths_));
+
+            const auto descs =
+                conv_to_gemm_transformer
+                    .template MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N<NDimSpatial>(
+                        Conv_N_,
+                        Conv_K_,
+                        Conv_C_,
+                        input_spatial_lengths_,
+                        filter_spatial_lengths_,
+                        output_spatial_lengths_,
+                        b_g_n_c_wis_strides,
+                        e_g_k_c_xs_strides,
+                        a_g_n_k_wos_strides,
+                        conv_filter_strides,
+                        conv_filter_dilations,
+                        input_left_pads,
+                        input_right_pads,
+                        k_batch_);
+
+            a_grid_desc_kbatch_k0_m_k1_ = descs[I0];
+            b_grid_desc_kbatch_k0_n_k1_ = descs[I1];
+            c_grid_desc_m_n_            = descs[I2];
+
+            // A/B/C Batch Stride
+            compute_ptr_offset_of_batch_.BatchStrideA_ = a_g_n_k_wos_strides[0];
+            compute_ptr_offset_of_batch_.BatchStrideB_ = b_g_n_c_wis_strides[0];
+            compute_ptr_offset_of_batch_.BatchStrideC_ =
+                Conv_K_ * Conv_C_ *
+                std::accumulate(begin(filter_spatial_lengths_),
+                                end(filter_spatial_lengths_),
+                                index_t{1},
+                                std::multiplies<>{});
+            const index_t GemmM = a_grid_desc_kbatch_k0_m_k1_.GetLength(I1);
+            const index_t GemmN = b_grid_desc_kbatch_k0_n_k1_.GetLength(I1);
+
+            c_grid_desc_mblock_mperblock_nblock_nperblock_ =
+                GridwiseGemm::MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
+                    c_grid_desc_m_n_,
+                    GridwiseGemm::CalculateMBlock(GemmM),
+                    GridwiseGemm::CalculateNBlock(GemmN));
+        }
+
+        const ADataType* p_a_grid_;
+        const BDataType* p_b_grid_;
+        CDataType* p_c_grid_;
+        AGridDesc_K0_M_K1 a_grid_desc_kbatch_k0_m_k1_;
+        BGridDesc_K0_N_K1 b_grid_desc_kbatch_k0_n_k1_;
+        CGridDesc_M_N c_grid_desc_m_n_;
+        CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock c_grid_desc_mblock_mperblock_nblock_nperblock_;
+
+        // for computing batch offset
+        ComputePtrOffsetOfStridedBatch<I1, I1, I0> compute_ptr_offset_of_batch_;
+
+        index_t M01_;
+        index_t N01_;
+
+        OutElementwiseOperation a_element_op_;
+        InElementwiseOperation b_element_op_;
+        WeiElementwiseOperation c_element_op_;
+
+        // for checking IsSupportedArgument()
+        const index_t Conv_G_;
+        const index_t Conv_N_;
+        const index_t Conv_K_;
+        const index_t Conv_C_;
+        std::array<ck::index_t, NDimSpatial> input_spatial_lengths_;
+        std::array<ck::index_t, NDimSpatial> filter_spatial_lengths_;
+        std::array<ck::index_t, NDimSpatial> output_spatial_lengths_;
+        const std::array<ck::index_t, NDimSpatial>& conv_filter_strides_;
+        const std::array<ck::index_t, NDimSpatial>& input_left_pads_;
+        const std::array<ck::index_t, NDimSpatial>& input_right_pads_;
+        const index_t k_batch_;
+    };
+
+    // Invoker
+    struct Invoker : public BaseInvoker
+    {
+        using Argument = DeviceOp::Argument;
+
+        void ShowInfo(const Argument& arg)
+        {
+            std::cout << "arg.a_grid_desc_kbatch_k0_m_k1_{"
+                      << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0) << ", "
+                      << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I1) << ", "
+                      << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I2) << ", "
+                      << arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I3) << "}" << std::endl;
+
+            std::cout << "arg.b_grid_desc_kbatch_k0_n_k1_{"
+                      << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I0) << ", "
+                      << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I1) << ", "
+                      << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I2) << ", "
+                      << arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I3) << "}" << std::endl;
+
+            std::cout << "arg.c_grid_desc_m_n_{" << arg.c_grid_desc_m_n_.GetLength(I0) << ", "
+                      << arg.c_grid_desc_m_n_.GetLength(I1) << "}" << std::endl;
+        }
+
+        float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
+        {
+            const index_t GemmM = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I1);
+            const index_t GemmN = arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I1);
+            const index_t GemmK = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0) *
+                                  arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I2);
+
+            const ADataType* p_a_grid = arg.p_a_grid_;
+            const BDataType* p_b_grid = arg.p_b_grid_;
+            typename GridwiseGemm::Argument gemm_arg{
+                p_a_grid, p_b_grid, arg.p_c_grid_, GemmM, GemmN, GemmK, I0, I0, I0, arg.k_batch_};
+
+            index_t gdx, gdy, gdz;
+            std::tie(gdx, gdy, gdz) = GridwiseGemm::CalculateGridSize(
+                gemm_arg.M, gemm_arg.N, gemm_arg.KBatch, arg.Conv_G_);
+
+            float ave_time = 0;
+
+            index_t k_grain                  = gemm_arg.KBatch * K0PerBlock;
+            index_t K_split                  = (gemm_arg.K + k_grain - 1) / k_grain * K0PerBlock;
+            const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split);
+
+            const auto num_k_per_block =
+                arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(Number<0>{}) / gemm_arg.KBatch;
+
+            const auto Run = [&](const auto& kernel) {
+                if(stream_config.flush_cache)
+                {
+                    typename GridwiseGemm::Argument gemm_arg_ = gemm_arg;
+                    ck::utility::RotatingMemWrapper<typename GridwiseGemm::Argument> rotating_mem(
+                        gemm_arg_,
+                        stream_config.rotating_count,
+                        gemm_arg_.M * gemm_arg_.K * sizeof(ADataType),
+                        gemm_arg_.K * gemm_arg_.N * sizeof(BDataType));
+                    rotating_mem.Print();
+
+                    auto run_flush_cache = [&]() {
+                        // flush icache
+                        ck::utility::flush_icache();
+                        // rotating mem
+                        rotating_mem.Next();
+                    };
+                    ave_time += ck::utility::launch_and_time_kernel_with_preprocess<false>(
+                        stream_config,
+                        run_flush_cache,
+                        kernel,
+                        dim3(gdx, gdy, gdz),
+                        dim3(BlockSize),
+                        0,
+                        gemm_arg_,
+                        arg.a_grid_desc_kbatch_k0_m_k1_,
+                        arg.b_grid_desc_kbatch_k0_n_k1_,
+                        arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                        arg.compute_ptr_offset_of_batch_,
+                        num_k_per_block);
+                }
+                else
+                {
+                    ave_time +=
+                        launch_and_time_kernel(stream_config,
+                                               kernel,
+                                               dim3(gdx, gdy, gdz),
+                                               dim3(BlockSize),
+                                               0,
+                                               gemm_arg,
+                                               arg.a_grid_desc_kbatch_k0_m_k1_,
+                                               arg.b_grid_desc_kbatch_k0_n_k1_,
+                                               arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
+                                               arg.compute_ptr_offset_of_batch_,
+                                               num_k_per_block);
+                }
+            };
+
+            constexpr index_t minimum_occupancy =
+                BlkGemmPipeSched == BlockGemmPipelineScheduler::Intrawave ? 1 : 2;
+
+            if(has_main_k_block_loop)
+            {
+                // Tail number always full
+                if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1 ||
+                             BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
+                {
+                    if(gemm_arg.KBatch > 1)
+                    {
+                        const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                            GridwiseGemm,
+                            remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                            remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                            remove_reference_t<
+                                DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                            true,
+                            InMemoryDataOperationEnum::AtomicAdd,
+                            minimum_occupancy>;
+                        Run(kernel);
+                    }
+                    else
+                    {
+                        const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                            GridwiseGemm,
+                            remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                            remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                            remove_reference_t<
+                                DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                            true,
+                            InMemoryDataOperationEnum::Set,
+                            minimum_occupancy>;
+                        Run(kernel);
+                    }
+                }
+                // Tail number could be One to Seven
+                else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2)
+                {
+                    if(gemm_arg.KBatch > 1)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One)
+                        {
+                            const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                GridwiseGemm,
+                                remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                remove_reference_t<
+                                    DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::One>;
+                            Run(kernel);
+                        }
+                        else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                TailNumber::Full)
+                        {
+                            const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                GridwiseGemm,
+                                remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                remove_reference_t<
+                                    DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Full>;
+                            Run(kernel);
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Two)
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                    remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                    remove_reference_t<
+                                        DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Two>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Three)
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                    remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                    remove_reference_t<
+                                        DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Three>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Four)
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                    remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                    remove_reference_t<
+                                        DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Four>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Five)
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                    remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                    remove_reference_t<
+                                        DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Five>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Six)
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                    remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                    remove_reference_t<
+                                        DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Six>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Seven)
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                    remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                    remove_reference_t<
+                                        DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    true,
+                                    InMemoryDataOperationEnum::AtomicAdd,
+                                    minimum_occupancy,
+                                    TailNumber::Seven>;
+                                Run(kernel);
+                            }
+                        }
+                    }
+                    else
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::One)
+                        {
+                            const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                GridwiseGemm,
+                                remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                remove_reference_t<
+                                    DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                true,
+                                InMemoryDataOperationEnum::Set,
+                                minimum_occupancy,
+                                TailNumber::One>;
+                            Run(kernel);
+                        }
+                        else if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                                TailNumber::Full)
+                        {
+                            const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                GridwiseGemm,
+                                remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                remove_reference_t<
+                                    DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                true,
+                                InMemoryDataOperationEnum::Set,
+                                minimum_occupancy,
+                                TailNumber::Full>;
+                            Run(kernel);
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 2)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Two)
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                    remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                    remove_reference_t<
+                                        DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    true,
+                                    InMemoryDataOperationEnum::Set,
+                                    minimum_occupancy,
+                                    TailNumber::Two>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 3)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Three)
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                    remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                    remove_reference_t<
+                                        DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    true,
+                                    InMemoryDataOperationEnum::Set,
+                                    minimum_occupancy,
+                                    TailNumber::Three>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 4)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Four)
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                    remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                    remove_reference_t<
+                                        DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    true,
+                                    InMemoryDataOperationEnum::Set,
+                                    minimum_occupancy,
+                                    TailNumber::Four>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 5)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Five)
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                    remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                    remove_reference_t<
+                                        DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    true,
+                                    InMemoryDataOperationEnum::Set,
+                                    minimum_occupancy,
+                                    TailNumber::Five>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 6)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Six)
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                    remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                    remove_reference_t<
+                                        DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    true,
+                                    InMemoryDataOperationEnum::Set,
+                                    minimum_occupancy,
+                                    TailNumber::Six>;
+                                Run(kernel);
+                            }
+                        }
+
+                        if constexpr(GridwiseGemm::BlockwiseGemmPipe::PrefetchStages > 7)
+                        {
+                            if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) ==
+                               TailNumber::Seven)
+                            {
+                                const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                    GridwiseGemm,
+                                    remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                    remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                    remove_reference_t<
+                                        DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                    ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                    true,
+                                    InMemoryDataOperationEnum::Set,
+                                    minimum_occupancy,
+                                    TailNumber::Seven>;
+                                Run(kernel);
+                            }
+                        }
+                    }
+                }
+
+                // Tail number could be Odd or Even
+                else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v4)
+                {
+                    if(gemm_arg.KBatch > 1)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3_2lds<
+                                GridwiseGemm,
+                                remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                remove_reference_t<
+                                    DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3_2lds<
+                                GridwiseGemm,
+                                remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                remove_reference_t<
+                                    DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                    else
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3_2lds<
+                                GridwiseGemm,
+                                remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                remove_reference_t<
+                                    DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                true,
+                                InMemoryDataOperationEnum::Set,
+                                minimum_occupancy,
+                                TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3_2lds<
+                                GridwiseGemm,
+                                remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                remove_reference_t<
+                                    DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                true,
+                                InMemoryDataOperationEnum::Set,
+                                minimum_occupancy,
+                                TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                }
+                else
+                {
+                    if(gemm_arg.KBatch > 1)
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                GridwiseGemm,
+                                remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                remove_reference_t<
+                                    DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                GridwiseGemm,
+                                remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                remove_reference_t<
+                                    DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                true,
+                                InMemoryDataOperationEnum::AtomicAdd,
+                                minimum_occupancy,
+                                TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                    else
+                    {
+                        if(GridwiseGemm::CalculateKBlockLoopTailNum(K_split) == TailNumber::Odd)
+                        {
+                            const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                GridwiseGemm,
+                                remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                remove_reference_t<
+                                    DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                true,
+                                InMemoryDataOperationEnum::Set,
+                                minimum_occupancy,
+                                TailNumber::Odd>;
+                            Run(kernel);
+                        }
+                        else
+                        {
+                            const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                                GridwiseGemm,
+                                remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                                remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                                remove_reference_t<
+                                    DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                                ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                                true,
+                                InMemoryDataOperationEnum::Set,
+                                minimum_occupancy,
+                                TailNumber::Even>;
+                            Run(kernel);
+                        }
+                    }
+                }
+            }
+            else
+            {
+                // Tail number always 1
+                if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v1)
+                {
+                    if(gemm_arg.KBatch > 1)
+                    {
+                        const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                            GridwiseGemm,
+                            remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                            remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                            remove_reference_t<
+                                DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                            false,
+                            InMemoryDataOperationEnum::AtomicAdd,
+                            minimum_occupancy>;
+                        Run(kernel);
+                    }
+                    else
+                    {
+                        const auto kernel = kernel_grouped_conv_bwd_weight_xdl_cshuffle_v3<
+                            GridwiseGemm,
+                            remove_reference_t<DeviceOp::AGridDesc_K0_M_K1>,
+                            remove_reference_t<DeviceOp::BGridDesc_K0_N_K1>,
+                            remove_reference_t<
+                                DeviceOp::CGridDesc_MBlock_MPerBlock_NBlock_NPerBlock>,
+                            ComputePtrOffsetOfStridedBatch<I1, I1, I0>,
+                            false,
+                            InMemoryDataOperationEnum::Set,
+                            minimum_occupancy>;
+                        Run(kernel);
+                    }
+                }
+            }
+            return ave_time;
+        }
+
+        float Run(const BaseArgument* p_arg,
+                  const StreamConfig& stream_config = StreamConfig{}) override
+        {
+            return Run(*dynamic_cast<const Argument*>(p_arg), stream_config);
+        }
+    };
+
+    static constexpr bool IsValidCompilationParameter()
+    {
+        // TODO: properly implement this check
+        return true;
+    }
+
+    static bool IsSupportedArgument(const Argument& arg)
+    {
+        const index_t GemmM = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I1);
+        const index_t GemmN = arg.b_grid_desc_kbatch_k0_n_k1_.GetLength(I1);
+        const index_t GemmK = arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I0) *
+                              arg.a_grid_desc_kbatch_k0_m_k1_.GetLength(I2);
+
+        typename GridwiseGemm::Argument gemm_arg{
+            nullptr, nullptr, nullptr, GemmM, GemmN, GemmK, I0, I0, I0, arg.k_batch_};
+
+        const auto num_k_loop = gemm_arg.AK0 / (K0PerBlock / K1);
+        if constexpr(BlkGemmPipelineVer != BlockGemmPipelineVersion::v1)
+        {
+            if(num_k_loop <= GridwiseGemm::BlockwiseGemmPipe::PrefetchStages)
+            {
+                return false;
+            }
+        }
+
+        if(!ck::is_xdl_supported())
+        {
+            return false;
+        }
+
+        if(!is_bf16_atomic_supported() && std::is_same_v<CDataType, ck::bhalf_t> &&
+           arg.k_batch_ > 1)
+        {
+            return false;
+        }
+
+        if constexpr(NDimSpatial == 1)
+        {
+            if constexpr(!is_GNWC_GKXC_GNWK<InLayout, WeiLayout, OutLayout>())
+            {
+                return false;
+            }
+        }
+        else if constexpr(NDimSpatial == 2)
+        {
+            if constexpr(!(is_NHWGC_GKYXC_NHWGK<InLayout, WeiLayout, OutLayout>() ||
+                           is_GNHWC_GKYXC_GNHWK<InLayout, WeiLayout, OutLayout>()))
+            {
+                return false;
+            }
+        }
+        else if constexpr(NDimSpatial == 3)
+        {
+            if constexpr(!(is_NDHWGC_GKZYXC_NDHWGK<InLayout, WeiLayout, OutLayout>() ||
+                           is_GNDHWC_GKZYXC_GNDHWK<InLayout, WeiLayout, OutLayout>()))
+            {
+                return false;
+            }
+        }
+        else
+        {
+            return false;
+        }
+
+        if constexpr(ConvBackwardWeightSpecialization ==
+                     ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0)
+        {
+            // check if it's 1x1, stride=1 pad = 0 conv
+            for(int i = 0; i < NDimSpatial; i++)
+            {
+                if(!(arg.filter_spatial_lengths_[i] == 1 && arg.conv_filter_strides_[i] == 1 &&
+                     arg.input_left_pads_[i] == 0 && arg.input_right_pads_[i] == 0))
+                {
+                    return false;
+                }
+            }
+        }
+        if(!(ABlockTransferSrcVectorDim == 1 && BBlockTransferSrcVectorDim == 1 &&
+             arg.Conv_K_ % ABlockTransferSrcScalarPerVector == 0 &&
+             arg.Conv_C_ % BBlockTransferSrcScalarPerVector == 0))
+        {
+            return false;
+        }
+
+        // vector store C matrix into global memory
+        if(!(arg.Conv_C_ % CBlockTransferScalarPerVector_NWaveNPerXdl == 0))
+        {
+            return false;
+        }
+
+        // Gridwise GEMM size
+        return true;
+    }
+
+    bool IsSupportedArgument(const BaseArgument* p_arg) override
+    {
+        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
+    }
+
+    static auto
+    MakeArgument(const InDataType* p_in_grid,
+                 WeiDataType* p_wei_grid,
+                 const OutDataType* p_out_grid,
+                 const std::array<index_t, NDimSpatial + 3>& b_g_n_c_wis_lengths, // input
+                 const std::array<index_t, NDimSpatial + 3>& b_g_n_c_wis_strides,
+                 const std::array<index_t, NDimSpatial + 3>& e_g_k_c_xs_lengths, // weight
+                 const std::array<index_t, NDimSpatial + 3>& e_g_k_c_xs_strides,
+                 const std::array<index_t, NDimSpatial + 3>& a_g_n_k_wos_lengths, // output
+                 const std::array<index_t, NDimSpatial + 3>& a_g_n_k_wos_strides,
+                 const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
+                 const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
+                 const std::array<ck::index_t, NDimSpatial>& input_left_pads,
+                 const std::array<ck::index_t, NDimSpatial>& input_right_pads,
+                 InElementwiseOperation in_element_op,
+                 WeiElementwiseOperation wei_element_op,
+                 OutElementwiseOperation out_element_op,
+                 const ck::index_t split_k)
+    {
+        return Argument{p_in_grid,
+                        p_wei_grid,
+                        p_out_grid,
+                        b_g_n_c_wis_lengths, // input
+                        b_g_n_c_wis_strides,
+                        e_g_k_c_xs_lengths, // weight
+                        e_g_k_c_xs_strides,
+                        a_g_n_k_wos_lengths, // output
+                        a_g_n_k_wos_strides,
+                        conv_filter_strides,
+                        conv_filter_dilations,
+                        input_left_pads,
+                        input_right_pads,
+                        1,
+                        1,
+                        in_element_op,
+                        wei_element_op,
+                        out_element_op,
+                        split_k};
+    }
+
+    static auto MakeInvoker() { return Invoker{}; }
+
+    std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_in_grid,
+                        void* p_wei_grid,
+                        const void* p_out_grid,
+                        const std::array<index_t, NDimSpatial + 3>& b_g_n_c_wis_lengths, // input
+                        const std::array<index_t, NDimSpatial + 3>& b_g_n_c_wis_strides,
+                        const std::array<index_t, NDimSpatial + 3>& e_g_k_c_xs_lengths, // weight
+                        const std::array<index_t, NDimSpatial + 3>& e_g_k_c_xs_strides,
+                        const std::array<index_t, NDimSpatial + 3>& a_g_n_k_wos_lengths, // output
+                        const std::array<index_t, NDimSpatial + 3>& a_g_n_k_wos_strides,
+                        const std::array<ck::index_t, NDimSpatial>& conv_filter_strides,
+                        const std::array<ck::index_t, NDimSpatial>& conv_filter_dilations,
+                        const std::array<ck::index_t, NDimSpatial>& input_left_pads,
+                        const std::array<ck::index_t, NDimSpatial>& input_right_pads,
+                        InElementwiseOperation in_element_op,
+                        WeiElementwiseOperation wei_element_op,
+                        OutElementwiseOperation out_element_op,
+                        const ck::index_t split_k) override
+    {
+        return std::make_unique<Argument>(static_cast<const InDataType*>(p_in_grid),
+                                          static_cast<WeiDataType*>(p_wei_grid),
+                                          static_cast<const OutDataType*>(p_out_grid),
+                                          b_g_n_c_wis_lengths, // input
+                                          b_g_n_c_wis_strides,
+                                          e_g_k_c_xs_lengths, // weight
+                                          e_g_k_c_xs_strides,
+                                          a_g_n_k_wos_lengths, // output
+                                          a_g_n_k_wos_strides,
+                                          conv_filter_strides,
+                                          conv_filter_dilations,
+                                          input_left_pads,
+                                          input_right_pads,
+                                          1,
+                                          1,
+                                          in_element_op,
+                                          wei_element_op,
+                                          out_element_op,
+                                          split_k);
+    }
+
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>(Invoker{});
+    }
+
+    std::string GetTypeString() const override
+    {
+        auto str = std::stringstream();
+
+        // clang-format off
+        str << "DeviceGroupedConvBwdWeight_Xdl_CShuffleV3"
+            << "<"
+            << BlockSize << ", "
+            << MPerBlock << ", "
+            << NPerBlock << ", "
+            << K0PerBlock << ", "
+            << getConvBackwardWeightSpecializationString(ConvBackwardWeightSpecialization) << ", "
+            << K1 << ", "
+            << MXdlPerWave << ", "
+            << NXdlPerWave << ", "
+            << ABlockTransferSrcScalarPerVector << ", "
+            << ABlockTransferDstScalarPerVector_K1 << ", "
+            << BBlockTransferSrcScalarPerVector << ", "
+            << BBlockTransferDstScalarPerVector_K1 << ", "
+            << CShuffleMXdlPerWavePerShuffle << ", "
+            << CShuffleNXdlPerWavePerShuffle << ", "
+            << CBlockTransferScalarPerVector_NWaveNPerXdl
+            << ">";
+        // clang-format on
+
+        return str.str();
+    }
+};
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/include/ck/tensor_operation/gpu/device/masking_specialization.hpp b/include/ck/tensor_operation/gpu/device/masking_specialization.hpp
index 0ec55984bc..9fe2f0d976 100644
--- a/include/ck/tensor_operation/gpu/device/masking_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/masking_specialization.hpp
@@ -13,6 +13,7 @@ enum struct MaskingSpecialization
     MaskOutUpperTriangle
 };
 
+#ifndef __HIPCC_RTC__
 inline std::string getMaskingSpecializationString(const MaskingSpecialization& s)
 {
     switch(s)
@@ -22,6 +23,7 @@ inline std::string getMaskingSpecializationString(const MaskingSpecialization& s
     default: return "Unrecognized specialization!";
     }
 }
+#endif
 
 struct MaskDisabledPredicate
 {
@@ -53,7 +55,7 @@ struct MaskOutUpperTrianglePredicate
 template <typename MaskOutPredicate>
 struct C0MatrixMask_impl
 {
-    __host__ __device__ C0MatrixMask_impl(index_t NRaw)
+    __host__ __device__ constexpr C0MatrixMask_impl(index_t NRaw)
         : NRaw_(NRaw), predicate_(MaskOutPredicate{})
     {
     }
diff --git a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
index 4a44177838..e836e73a1d 100644
--- a/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
+++ b/include/ck/tensor_operation/gpu/device/tensor_layout.hpp
@@ -436,7 +436,7 @@ struct G_NDHW : public BaseTensorLayout
 
 } // namespace convolution
 
-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 template <
     typename Layout,
     typename std::enable_if<std::is_base_of<BaseTensorLayout, Layout>::value, bool>::type = false>
diff --git a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
index 754967dc4d..17ea38aa0d 100644
--- a/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp
@@ -719,7 +719,7 @@ struct FastGelu
 
     template <typename Y, typename X>
     __device__ void operator()(Y& y, const X& x) const;
-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
     template <>
     __host__ void operator()<float, float>(float& y, const float& x) const
     {
@@ -731,7 +731,6 @@ struct FastGelu
         y               = x / (1.f + emu);
     }
 #endif
-
     // device code, use lower precision "__ocml_exp_f32" and "rcp"
     template <>
     __device__ void operator()<float, float>(float& y, const float& x) const
diff --git a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
index 2bc9ef87ac..64fad1ca48 100644
--- a/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
+++ b/include/ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp
@@ -8,7 +8,7 @@
 #include "ck/utility/tuple.hpp"
 #include "ck/tensor_description/tensor_adaptor.hpp"
 #include "ck/tensor_description/multi_index_transform_helper.hpp"
-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 #include <limits>
 #include <stdlib.h>
 #endif
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
index eb1eb533d7..060f6d5d15 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
@@ -473,7 +473,7 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
         return matrix_padder.PadCDescriptor_M_N(e_grid_desc_mraw_nraw);
     }
 
-#ifdef CK_CODE_GEN_RTC
+#if defined(__HIPCC_RTC__) || defined(CK_CODE_GEN_RTC)
     template <typename DsLayout, GemmSpecialization GemmSpec>
     __host__ __device__ static auto
     MakeDsGridDescriptor_M_N(const ck::Array<index_t, NumDTensor>& MRaws,
@@ -486,6 +486,7 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
                              const std::array<index_t, NumDTensor>& NRaws,
                              const std::array<index_t, NumDTensor>& DsStride)
 #endif
+
     {
         return generate_tuple(
             [&](auto i) {
@@ -949,7 +950,7 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
                                const index_t K,
                                const index_t StrideA,
                                const index_t StrideB,
-#ifdef CK_CODE_GEN_RTC
+#if defined(__HIPCC_RTC__) || defined(CK_CODE_GEN_RTC)
                                const ck::Array<index_t, NumDTensor> StrideDs,
 #else
                                const std::array<index_t, NumDTensor> StrideDs,
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp
index 9dad66913a..f8de0a48e5 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_pipeline_selector.hpp
@@ -2,7 +2,8 @@
 // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
-#ifndef CK_CODE_GEN_RTC
+
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 #include <iostream>
 #include <ostream>
 #endif
@@ -54,7 +55,7 @@ constexpr auto GridwiseGemmPipeline_Selector()
     }
     else
     {
-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
         std::cerr << "GridwiseGemmPipeline configuration is not available" << std::endl;
 #endif
     }
@@ -62,7 +63,7 @@ constexpr auto GridwiseGemmPipeline_Selector()
 
 } // namespace ck
 
-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 inline std::ostream& operator<<(std::ostream& os, const ck::PipelineVersion& p)
 {
     switch(p)
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp
index e04f24c989..fcb12f4a14 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp
@@ -230,6 +230,23 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
             }
         }();
 
+        // Pad both M and K to be multiples of the block sizes
+        const auto a_grid_desc_m_k =
+            transform_tensor_descriptor(a_grid_desc_mraw_kraw,
+                                        make_tuple(make_right_pad_transform(M, MPad - M),
+                                                   make_right_pad_transform(K, KPad - K)),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+        const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
+            a_grid_desc_m_k,
+            make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
+                       make_pass_through_transform(MPad)),
+            make_tuple(Sequence<1>{}, Sequence<0>{}),
+            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+        return a_grid_desc_ak0_m_ak1;
+#if 0
         using GemmSpecialization = tensor_operation::device::GemmSpecialization;
 
         if constexpr(GemmSpec == GemmSpecialization::MKPadding ||
@@ -296,6 +313,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
 
             return a_grid_desc_ak0_m_ak1;
         }
+#endif
     }
 
     __device__ static auto MakeBGridDescriptor_BK0_N_BK1(
@@ -312,6 +330,23 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
             }
         }();
 
+        // Pad both N and K to be multiples of the block sizes
+        const auto b_grid_desc_n_k =
+            transform_tensor_descriptor(b_grid_desc_nraw_kraw,
+                                        make_tuple(make_right_pad_transform(N, NPad - N),
+                                                   make_right_pad_transform(K, KPad - K)),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                        make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+        const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
+            b_grid_desc_n_k,
+            make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
+                       make_pass_through_transform(NPad)),
+            make_tuple(Sequence<1>{}, Sequence<0>{}),
+            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+        return b_grid_desc_bk0_n_bk1;
+#if 0     
         using GemmSpecialization = tensor_operation::device::GemmSpecialization;
 
         if constexpr(GemmSpec == GemmSpecialization::NKPadding ||
@@ -378,6 +413,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
 
             return b_grid_desc_bk0_n_bk1;
         }
+#endif
     }
 
     template <typename ABlockDesc_AK0_M_AK1>
@@ -412,6 +448,13 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
             }
         }();
 
+        // Pad both M and N to be multiples of the block sizes
+        return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
+                                           make_tuple(make_right_pad_transform(M, MPad - M),
+                                                      make_right_pad_transform(N, NPad - N)),
+                                           make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                           make_tuple(Sequence<0>{}, Sequence<1>{}));
+#if 0
         using GemmSpecialization = tensor_operation::device::GemmSpecialization;
 
         if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
@@ -449,6 +492,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
             // not pad M or N
             return c_grid_desc_mraw_nraw;
         }
+#endif
     }
 
     struct Problem
@@ -953,7 +997,8 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
         if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::MPadding ||
                        GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
                        GemmSpec == tensor_operation::device::GemmSpecialization::MKPadding ||
-                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) &&
+                     !(is_same<tensor_layout::gemm::RowMajor, ALayout>::value))
         {
             if(!(karg.M % MPerBlock == 0))
             {
@@ -970,7 +1015,8 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
         if constexpr(!(GemmSpec == tensor_operation::device::GemmSpecialization::NPadding ||
                        GemmSpec == tensor_operation::device::GemmSpecialization::MNPadding ||
                        GemmSpec == tensor_operation::device::GemmSpecialization::NKPadding ||
-                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding))
+                       GemmSpec == tensor_operation::device::GemmSpecialization::MNKPadding) &&
+                     (is_same<tensor_layout::gemm::RowMajor, BLayout>::value))
         {
             if(!(karg.N % NPerBlock == 0))
             {
@@ -1036,6 +1082,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                               << ABlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
                               << __LINE__ << ", in function: " << __func__ << std::endl;
                 }
+
                 return false;
             }
         }
@@ -1051,6 +1098,10 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                               << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
                               << __LINE__ << ", in function: " << __func__ << std::endl;
                 }
+                std::cout << "Arg N (" << karg.N
+                          << ") value is not a multiple of BBlockTransferSrcScalarPerVector ("
+                          << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
+                          << __LINE__ << ", in function: " << __func__ << std::endl;
                 return false;
             }
         }
@@ -1065,6 +1116,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                               << BBlockTransferSrcScalarPerVector << " )! " << __FILE__ << ":"
                               << __LINE__ << ", in function: " << __func__ << std::endl;
                 }
+
                 return false;
             }
         }
@@ -1082,6 +1134,7 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                               << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
                               << std::endl;
                 }
+
                 return false;
             }
         }
@@ -1098,17 +1151,8 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                               << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
                               << std::endl;
                 }
-                return false;
-            }
-        }
 
-        if constexpr(is_same<remove_cvref_t<CDataType>, bhalf_t>::value)
-        {
-            if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
-            {
-                std::cout << " Grid size: " << karg.Grid_size << " > 1 is not support yet"
-                          << __FILE__ << ":" << __LINE__ << ", in function: " << __func__
-                          << std::endl;
+                return false;
             }
         }
 
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
old mode 100644
new mode 100755
diff --git a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
index 4f20487b9b..8c0b950941 100644
--- a/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/xdlops_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -780,7 +780,6 @@ struct mfma_type<MfmaInstr::mfma_f32_16x16x32bf8f8>
     }
 };
 
-// TODO: fix mfma...f8f6f4 instructions
 template <>
 struct mfma_type<MfmaInstr::mfma_f32_32x32x64f8f6f4>
 {
@@ -847,9 +846,14 @@ struct mfma_type<MfmaInstr::mfma_scale_f32_32x32x64f8f6f4>
     // clang-format on
 
     template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
-    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    __device__ void run(const FloatA& a,
+                        const int32_t scale_a,
+                        const FloatB& b,
+                        const int32_t scale_b,
+                        FloatC& reg_c) const
     {
-        intrin_mfma_scale_f32_32x32x64f8f6f4<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+        intrin_mfma_scale_f32_32x32x64f8f6f4<MPerXdlops, NPerXdlops>::Run(
+            a, scale_a, b, scale_b, reg_c);
     }
 };
 
@@ -871,9 +875,14 @@ struct mfma_type<MfmaInstr::mfma_scale_f32_16x16x128f8f6f4>
     // clang-format on
 
     template <index_t MPerXdlops, index_t NPerXdlops, class FloatA, class FloatB, class FloatC>
-    __device__ void run(const FloatA& a, const FloatB& b, FloatC& reg_c) const
+    __device__ void run(const FloatA& a,
+                        const int32_t scale_a,
+                        const FloatB& b,
+                        const int32_t scale_b,
+                        FloatC& reg_c) const
     {
-        intrin_mfma_scale_f32_16x16x128f8f6f4<MPerXdlops, NPerXdlops>::Run(a, b, reg_c);
+        intrin_mfma_scale_f32_16x16x128f8f6f4<MPerXdlops, NPerXdlops>::Run(
+            a, scale_a, b, scale_b, reg_c);
     }
 };
 
diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm_v2.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm_v2.hpp
index bc290d5641..f34e0e59b3 100644
--- a/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm_v2.hpp
+++ b/include/ck/tensor_operation/operator_transform/transform_conv_bwd_weight_to_gemm_v2.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -34,6 +34,94 @@ struct TransformConvBwdWeightToGemmV2
     static constexpr auto I0 = Number<0>{};
     static constexpr auto I1 = Number<1>{};
 
+    template <index_t NDim, typename enable_if<NDim == 1, bool>::type = false>
+    constexpr static auto
+    make_out_grid_desc(const index_t N,
+                       const index_t Wo,
+                       const index_t K,
+                       const std::array<index_t, NDimSpatial + 3>& output_strides)
+    {
+        const index_t BatchStride = output_strides[0];
+        const index_t WoStride    = output_strides[3];
+        const auto KStride        = Number<1>{};
+        return make_naive_tensor_descriptor(make_tuple(N * Wo, NumGroupsToMerge, K),
+                                            make_tuple(WoStride, BatchStride, KStride));
+    }
+
+    template <index_t NDim, typename enable_if<NDim == 1, bool>::type = false>
+    constexpr static auto
+    make_in_grid_desc(const index_t N,
+                      const index_t Wi,
+                      const index_t C,
+                      const std::array<index_t, NDimSpatial + 3>& input_strides)
+    {
+        const index_t BatchStride = input_strides[0];
+        const index_t NStride     = input_strides[1];
+        const index_t WiStride    = input_strides[3];
+        const auto CStride        = input_strides[2];
+        if constexpr(ConvBackwardWeightSpecialization ==
+                     device::ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0)
+        {
+            return make_naive_tensor_descriptor(make_tuple(N * Wi, NumGroupsToMerge, C),
+                                                make_tuple(WiStride, BatchStride, CStride));
+        }
+        else
+        {
+            return make_naive_tensor_descriptor(
+                make_tuple(N, Wi, NumGroupsToMerge, C),
+                make_tuple(NStride, WiStride, BatchStride, CStride));
+        }
+    }
+
+    template <index_t NDim, typename enable_if<NDim == 1, bool>::type = false>
+    constexpr static auto
+    make_wei_grid_desc(const index_t K,
+                       const index_t X,
+                       const index_t C,
+                       const std::array<index_t, NDimSpatial + 3>& weights_strides)
+    {
+        const auto CStride     = Number<1>{};
+        const auto KStride     = weights_strides[1];
+        const auto XStride     = weights_strides[3];
+        const auto BatchStride = weights_strides[0];
+        // Add NumGroupsToMerge for Batch+M dimension and, 1 as a placehorder
+        // for Batch+N dimension
+        const auto desc = make_naive_tensor_descriptor(
+            make_tuple(NumGroupsToMerge, K, X, 1, C),
+            make_tuple(BatchStride, KStride, XStride, BatchStride, CStride));
+        // Padd 1 to NumGroupsToMerge
+        const auto padded_desc = transform_tensor_descriptor(
+            desc,
+            make_tuple(make_pass_through_transform(NumGroupsToMerge),
+                       make_pass_through_transform(K),
+                       make_pass_through_transform(X),
+                       make_pad_transform(1, 0, NumGroupsToMerge - 1),
+                       make_pass_through_transform(C)),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}));
+        // We need only matrices from diagonal. Xor returns 0 for the same
+        // values. So if matrices is not on diagonal then it will be stored in padding.
+        // To avoid use of modulo after xor we assume that NumBatch to merge is power of 2.
+        static_assert(NumGroupsToMerge == 1 || NumGroupsToMerge == 2 || NumGroupsToMerge == 4 ||
+                      NumGroupsToMerge == 8 || NumGroupsToMerge == 16 || NumGroupsToMerge == 32 ||
+                      NumGroupsToMerge == 64);
+        const auto unmerged_padded_desc = transform_tensor_descriptor(
+            padded_desc,
+            make_tuple(make_xor_transform(make_tuple(NumGroupsToMerge, NumGroupsToMerge)),
+                       make_pass_through_transform(K),
+                       make_pass_through_transform(X),
+                       make_pass_through_transform(C)),
+            make_tuple(Sequence<0, 3>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}),
+            make_tuple(Sequence<0, 3>{}, Sequence<1>{}, Sequence<2>{}, Sequence<4>{}));
+        // Merge To M, N
+        return transform_tensor_descriptor(
+            unmerged_padded_desc,
+            make_tuple(make_merge_transform(make_tuple(NumGroupsToMerge, K)),
+                       make_merge_transform(make_tuple(X, NumGroupsToMerge, C))),
+            make_tuple(Sequence<0, 1>{}, Sequence<2, 3, 4>{}),
+            make_tuple(Sequence<0>{}, Sequence<1>{}));
+    }
+
     template <index_t NDim, typename enable_if<NDim == 2, bool>::type = false>
     constexpr static auto
     make_out_grid_desc(const index_t N,
@@ -221,6 +309,187 @@ struct TransformConvBwdWeightToGemmV2
             make_tuple(Sequence<0>{}, Sequence<1>{}));
     }
 
+    template <index_t NDim, typename enable_if<NDim == 1, bool>::type = false>
+    static auto MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(
+        const index_t N,
+        const index_t K,
+        const index_t C,
+        const std::array<index_t, NDimSpatial>& input_spatial_lengths,
+        const std::array<index_t, NDimSpatial>& filter_spatial_lengths,
+        const std::array<index_t, NDimSpatial>& output_spatial_lengths,
+        const std::array<index_t, NDimSpatial + 3>& input_strides,
+        const std::array<index_t, NDimSpatial + 3>& weights_strides,
+        const std::array<index_t, NDimSpatial + 3>& output_strides,
+        const std::array<index_t, NDimSpatial>& conv_filter_strides,
+        const std::array<index_t, NDimSpatial>& conv_filter_dilations,
+        const std::array<index_t, NDimSpatial>& input_left_pads,
+        const std::array<index_t, NDimSpatial>& input_right_pads,
+        const index_t batch_k)
+    {
+        using namespace ck;
+
+        const index_t Wi = input_spatial_lengths[0];
+
+        const index_t Wo = output_spatial_lengths[0];
+
+        const index_t X = filter_spatial_lengths[0];
+
+        const index_t ConvStrideW = conv_filter_strides[0];
+
+        const index_t ConvDilationW = conv_filter_dilations[0];
+
+        const index_t InLeftPadW = input_left_pads[0];
+
+        const index_t InRightPadW = input_right_pads[0];
+
+        const index_t GemmKTotal = N * Wo;
+        const index_t GemmM      = K * NumGroupsToMerge;
+        const index_t GemmN      = C * X * NumGroupsToMerge;
+
+        const auto PadGemmM = MPerBlock - GemmM % MPerBlock;
+        const auto PadGemmN = NPerBlock - GemmN % NPerBlock;
+
+        const index_t GemmKBatch = batch_k;
+        const index_t GemmK0 =
+            math::integer_divide_ceil(GemmKTotal, GemmK1Number * K0PerBlock * GemmKBatch) *
+            K0PerBlock;
+        const index_t GemmKPad = GemmKBatch * GemmK0 * GemmK1Number;
+
+        const auto out_grid_desc = make_out_grid_desc<NDim>(N, Wo, K, output_strides);
+        const auto in_grid_desc  = make_in_grid_desc<NDim>(N, Wi, C, input_strides);
+        const auto wei_grid_desc = make_wei_grid_desc<NDim>(K, X, C, weights_strides);
+
+        if constexpr(ConvBackwardWeightSpecialization ==
+                     device::ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0)
+        {
+            // A: output tensor
+            const auto out_gemmkpad_gemmm_grid_desc = transform_tensor_descriptor(
+                out_grid_desc,
+                make_tuple(
+                    make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
+                    make_merge_transform(make_tuple(NumGroupsToMerge, GemmM / NumGroupsToMerge))),
+                make_tuple(Sequence<0>{}, Sequence<1, 2>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
+                out_gemmkpad_gemmm_grid_desc,
+                make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)),
+                           make_pass_through_transform(GemmM)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            // B: input tensor
+            const auto in_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor(
+                in_grid_desc,
+                make_tuple(
+                    make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
+                    make_merge_transform(make_tuple(NumGroupsToMerge, GemmN / NumGroupsToMerge))),
+                make_tuple(Sequence<0>{}, Sequence<1, 2>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
+                in_gemmkpad_gemmn_grid_desc,
+                make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)),
+                           make_pass_through_transform(GemmN)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
+                              in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
+                              wei_grid_desc);
+        }
+        else
+        {
+            // A: output tensor
+            const auto out_gemmkpad_gemmm_grid_desc = transform_tensor_descriptor(
+                out_grid_desc,
+                make_tuple(
+                    make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
+                    make_merge_transform(make_tuple(NumGroupsToMerge, GemmM / NumGroupsToMerge))),
+                make_tuple(Sequence<0>{}, Sequence<1, 2>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc = transform_tensor_descriptor(
+                out_gemmkpad_gemmm_grid_desc,
+                make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)),
+                           make_pass_through_transform(GemmM)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            // B: input tensor
+            const auto in_n_hip_wip_c_grid_desc = transform_tensor_descriptor(
+                in_grid_desc,
+                make_tuple(make_pass_through_transform(N),
+                           make_pad_transform(Wi, InLeftPadW, InRightPadW),
+                           make_pass_through_transform(NumGroupsToMerge),
+                           make_pass_through_transform(C)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}));
+
+            const auto in_n_y_ho_x_wo_c_grid_desc = transform_tensor_descriptor(
+                in_n_hip_wip_c_grid_desc,
+                make_tuple(
+                    make_pass_through_transform(N),
+                    make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)),
+                    make_pass_through_transform(NumGroupsToMerge),
+                    make_pass_through_transform(C)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
+                make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{}, Sequence<4>{}));
+
+            const auto in_gemmktotal_gemmn_grid_desc = transform_tensor_descriptor(
+                in_n_y_ho_x_wo_c_grid_desc,
+                make_tuple(make_merge_transform(make_tuple(X, NumGroupsToMerge, C)),
+                           make_merge_transform(make_tuple(N, Wo))),
+                make_tuple(Sequence<1, 3, 4>{}, Sequence<0, 2>{}),
+                make_tuple(Sequence<1>{}, Sequence<0>{}));
+
+            const auto in_gemmkpad_gemmn_grid_desc = transform_tensor_descriptor(
+                in_gemmktotal_gemmn_grid_desc,
+                make_tuple(make_right_pad_transform(GemmKTotal, GemmKPad - GemmKTotal),
+                           make_pass_through_transform(GemmN)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc = transform_tensor_descriptor(
+                in_gemmkpad_gemmn_grid_desc,
+                make_tuple(make_unmerge_transform(make_tuple(GemmKBatch * GemmK0, GemmK1Number)),
+                           make_pass_through_transform(GemmN)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}),
+                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
+
+            // Padd
+            const auto out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc =
+                transform_tensor_descriptor(
+                    out_gemmkbatch_gemmk0_gemmm_gemmk1_grid_desc,
+                    make_tuple(make_pass_through_transform(GemmKBatch * GemmK0),
+                               make_right_pad_transform(GemmM, PadGemmM),
+                               make_pass_through_transform(GemmK1Number)),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+
+            const auto in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc =
+                transform_tensor_descriptor(
+                    in_gemmkbatch_gemmk0_gemmn_gemmk1_grid_desc,
+                    make_tuple(make_pass_through_transform(GemmKBatch * GemmK0),
+                               make_right_pad_transform(GemmN, PadGemmN),
+                               make_pass_through_transform(GemmK1Number)),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                    make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}));
+
+            const auto wei_gemmm_gemmn_pad_grid_desc =
+                transform_tensor_descriptor(wei_grid_desc,
+                                            make_tuple(make_right_pad_transform(GemmM, PadGemmM),
+                                                       make_right_pad_transform(GemmN, PadGemmN)),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
+                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
+
+            return make_tuple(out_gemmkbatch_gemmk0_gemmm_gemmk1_pad_grid_desc,
+                              in_gemmkbatch_gemmk0_gemmn_gemmk1_pad_grid_desc,
+                              wei_gemmm_gemmn_pad_grid_desc);
+        }
+
+    } // function end
+
     template <index_t NDim, typename enable_if<NDim == 2, bool>::type = false>
     static auto MakeABCGridDescriptor_A_K0_M_K1_B_K0_N_K1_C_M_N(
         const index_t N,
diff --git a/include/ck/utility/amd_buffer_addressing.hpp b/include/ck/utility/amd_buffer_addressing.hpp
index 328e37d009..317f324e6d 100644
--- a/include/ck/utility/amd_buffer_addressing.hpp
+++ b/include/ck/utility/amd_buffer_addressing.hpp
@@ -1008,6 +1008,7 @@ llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc,
                                 index_t offset,
                                 index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds");
 
+#ifndef __HIPCC_RTC__
 template <typename T, index_t NumElemsPerThread>
 __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
                                               const index_t global_offset,
@@ -1059,5 +1060,6 @@ __device__ void amd_direct_load_global_to_lds(const T* global_base_ptr,
         src_resource, lds_ptr, sizeof(uint32_t), global_offset_bytes, 0, 0, 0);
 #endif
 }
+#endif
 
 } // namespace ck
diff --git a/include/ck/utility/amd_wave_read_first_lane.hpp b/include/ck/utility/amd_wave_read_first_lane.hpp
index 128c8e9a2c..3604712837 100644
--- a/include/ck/utility/amd_wave_read_first_lane.hpp
+++ b/include/ck/utility/amd_wave_read_first_lane.hpp
@@ -7,7 +7,7 @@
 #include "ck/utility/functional2.hpp"
 #include "ck/utility/math.hpp"
 
-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 #include <array>
 #include <cstddef>
 #include <cstdint>
diff --git a/include/ck/utility/amd_xdlops.hpp b/include/ck/utility/amd_xdlops.hpp
index b125e3adf6..010b7aabd3 100644
--- a/include/ck/utility/amd_xdlops.hpp
+++ b/include/ck/utility/amd_xdlops.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -533,9 +533,9 @@ struct intrin_mfma_scale_f32_32x32x64f8f6f4<32, 32>
                 reg_c.template AsType<float16_t>()[Number<0>{}],
                 0, // cbsz
                 0, // blgp
-                0, // { OPSEL_HI[0], OPSEL[0] }?
+                0, // OPSEL
                 scale_a,
-                0, // { OPSEL_HI[1], OPSEL[1] }?
+                0, // OPSEL
                 scale_b);
 #else
         ignore = reg_a;
@@ -569,9 +569,9 @@ struct intrin_mfma_scale_f32_16x16x128f8f6f4<16, 16>
                 reg_c.template AsType<float4_t>()[Number<0>{}],
                 0, // cbsz
                 0, // blgp
-                0, // { OPSEL_HI[0], OPSEL[0] }?
+                0, // OPSEL
                 scale_a,
-                0, // { OPSEL_HI[1], OPSEL[1] }?
+                0, // OPSEL
                 scale_b);
 #else
         ignore = reg_a;
diff --git a/include/ck/utility/data_type.hpp b/include/ck/utility/data_type.hpp
index f90fcf6791..2e3b09eae9 100644
--- a/include/ck/utility/data_type.hpp
+++ b/include/ck/utility/data_type.hpp
@@ -6,16 +6,20 @@
 #include "ck/utility/amd_ck_fp8.hpp"
 #include "ck/utility/e8m0.hpp"
 #include "ck/utility/statically_indexed_array.hpp"
-#ifdef CK_CODE_GEN_RTC
+
+/// Definitions from <cstdint>, <cmath> conflict with
+/// /opt/rocm/include/hip/amd_detail/amd_hip_vector_types.h.
+
+#if defined(__HIPCC_RTC__) || defined(CK_CODE_GEN_RTC)
 using int8_t   = signed char;
 using uint8_t  = unsigned char;
 using int16_t  = signed short;
 using uint16_t = unsigned short;
 using float_t  = float;
-#endif
-namespace ck {
+#endif // __HIPCC_RTC__
 
-#ifdef CK_CODE_GEN_RTC
+namespace ck {
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 using byte = unsigned char;
 #else
 using std::byte;
@@ -2612,7 +2616,7 @@ using pk_i4x2_t = typename vector_type<pk_i4_t, 2>::type;
 using pk_i4x4_t = typename vector_type<pk_i4_t, 4>::type;
 using pk_i4x8_t = typename vector_type<pk_i4_t, 8>::type;
 
-#ifdef CK_CODE_GEN_RTC
+#if defined(__HIPCC_RTC__) || defined(CK_CODE_GEN_RTC)
 template <typename T>
 struct NumericLimits;
 
@@ -2825,6 +2829,118 @@ struct NumericLimits<bf8_ocp_t>
         return bit_cast<bf8_ocp_t>(binary_qnan);
     }
 };
+
+template <>
+struct NumericLimits<f4_t>
+{
+    static constexpr uint8_t binary_min_normal    = 0x2; // 0b0010
+    static constexpr uint8_t binary_max_normal    = 0x7; // 0b0111
+    static constexpr uint8_t binary_lowest_normal = 0xF; // 0b1111
+    static constexpr uint8_t binary_min_subnorm   = 0x1; // 0b0001
+    static constexpr uint8_t binary_max_subnorm   = 0x1; // 0b0001
+
+    static constexpr float data_max_normal_number    = 6;
+    static constexpr float data_min_subnormal_number = 0.5;
+
+    __host__ __device__ static constexpr f4_t Min() { return f4_t(binary_min_normal); }
+    __host__ __device__ static constexpr f4_t Max() { return f4_t(binary_max_normal); }
+    __host__ __device__ static constexpr f4_t Lowest() { return f4_t(binary_lowest_normal); }
+    __host__ __device__ static constexpr f4_t MinSubnorm() { return f4_t(binary_min_subnorm); }
+    __host__ __device__ static constexpr f4_t MaxSubnorm() { return f4_t(binary_max_subnorm); }
+
+    __host__ __device__ static constexpr float DataMaxNorm() { return data_max_normal_number; }
+    __host__ __device__ static constexpr float DataMinSubnorm()
+    {
+        return data_min_subnormal_number;
+    }
+};
+
+template <>
+struct NumericLimits<f6_t>
+{
+    static constexpr uint8_t binary_min_normal    = 0x08; // 0b001000
+    static constexpr uint8_t binary_max_normal    = 0x1F; // 0b011111
+    static constexpr uint8_t binary_lowest_normal = 0x3F; // 0b111111
+    static constexpr uint8_t binary_min_subnorm   = 0x01; // 0b000001
+    static constexpr uint8_t binary_max_subnorm   = 0x07; // 0b000111
+
+    static constexpr float data_max_normal_number    = 7.5;
+    static constexpr float data_min_subnormal_number = 0.125;
+
+    __host__ __device__ static constexpr f6_t Min() { return f6_t(binary_min_normal & 0b111111); }
+    __host__ __device__ static constexpr f6_t Max() { return f6_t(binary_max_normal & 0b111111); }
+    __host__ __device__ static constexpr f6_t Lowest()
+    {
+        return f6_t(binary_lowest_normal & 0b111111);
+    }
+    __host__ __device__ static constexpr f6_t MinSubnorm()
+    {
+        return f6_t(binary_min_subnorm & 0b111111);
+    }
+    __host__ __device__ static constexpr f6_t MaxSubnorm()
+    {
+        return f6_t(binary_max_subnorm & 0b111111);
+    }
+
+    __host__ __device__ static constexpr float DataMaxNorm() { return data_max_normal_number; }
+    __host__ __device__ static constexpr float DataMinSubnorm()
+    {
+        return data_min_subnormal_number;
+    }
+};
+
+template <>
+struct NumericLimits<bf6_t>
+{
+    static constexpr uint8_t binary_min_normal    = 0x08; // 0b001000
+    static constexpr uint8_t binary_max_normal    = 0x1F; // 0b011111
+    static constexpr uint8_t binary_lowest_normal = 0x3F; // 0b111111
+    static constexpr uint8_t binary_min_subnorm   = 0x01; // 0b000001
+    static constexpr uint8_t binary_max_subnorm   = 0x03; // 0b000011
+
+    static constexpr float data_max_normal_number    = 28;
+    static constexpr float data_min_subnormal_number = 0.0625;
+
+    __host__ __device__ static constexpr bf6_t Min() { return bf6_t(binary_min_normal); }
+    __host__ __device__ static constexpr bf6_t Max() { return bf6_t(binary_max_normal); }
+    __host__ __device__ static constexpr bf6_t Lowest() { return bf6_t(binary_lowest_normal); }
+    __host__ __device__ static constexpr bf6_t MinSubnorm() { return bf6_t(binary_min_subnorm); }
+    __host__ __device__ static constexpr bf6_t MaxSubnorm() { return bf6_t(binary_max_subnorm); }
+
+    __host__ __device__ static constexpr float DataMaxNorm() { return data_max_normal_number; }
+    __host__ __device__ static constexpr float DataMinSubnorm()
+    {
+        return data_min_subnormal_number;
+    }
+};
+
+template <>
+struct NumericLimits<e8m0_bexp_t>
+{
+    static constexpr e8m0_bexp_t binary_min  = 0x00; // 0b00000000
+    static constexpr e8m0_bexp_t binary_max  = 0xFE; // 0b11111110
+    static constexpr e8m0_bexp_t binary_qnan = 0xFF; // 0b11111111
+    static constexpr e8m0_bexp_t binary_1    = 0x7F; // 0b01111111
+    static constexpr e8m0_bexp_t binary_2    = 0x80; // 0b10000000
+    static constexpr e8m0_bexp_t binary_3    = 0x82; // 0b10000010
+    static constexpr e8m0_bexp_t binary_135  = 0x87; // 0b10000111
+    static constexpr e8m0_bexp_t binary_142  = 0x8E; // 0b10001110
+
+    __host__ __device__ static constexpr e8m0_bexp_t Min() { return e8m0_bexp_t(binary_min); }
+    __host__ __device__ static constexpr e8m0_bexp_t Max() { return e8m0_bexp_t(binary_max); }
+    __host__ __device__ static constexpr e8m0_bexp_t QuietNaN() { return e8m0_bexp_t(binary_qnan); }
+    __host__ __device__ static constexpr e8m0_bexp_t Binary_1() { return e8m0_bexp_t(binary_1); }
+    __host__ __device__ static constexpr e8m0_bexp_t Binary_2() { return e8m0_bexp_t(binary_2); }
+    __host__ __device__ static constexpr e8m0_bexp_t Binary_3() { return e8m0_bexp_t(binary_3); }
+    __host__ __device__ static constexpr e8m0_bexp_t Binary_135()
+    {
+        return e8m0_bexp_t(binary_135);
+    }
+    __host__ __device__ static constexpr e8m0_bexp_t Binary_142()
+    {
+        return e8m0_bexp_t(binary_142);
+    }
+};
 #else
 template <typename T>
 struct NumericLimits
@@ -2959,7 +3075,6 @@ struct NumericLimits<bf8_ocp_t>
         return bit_cast<bf8_ocp_t>(binary_qnan);
     }
 };
-#endif
 
 template <>
 struct NumericLimits<f4_t>
@@ -3072,6 +3187,7 @@ struct NumericLimits<e8m0_bexp_t>
         return e8m0_bexp_t(binary_142);
     }
 };
+#endif
 
 template <typename T>
 struct NumericUtils
diff --git a/include/ck/utility/enable_if.hpp b/include/ck/utility/enable_if.hpp
index 6ba63fc761..9d5403ceb2 100644
--- a/include/ck/utility/enable_if.hpp
+++ b/include/ck/utility/enable_if.hpp
@@ -4,15 +4,7 @@
 #pragma once
 
 namespace ck {
-
-#ifndef CK_CODE_GEN_RTC
-template <bool B, typename T = void>
-using enable_if = std::enable_if<B, T>;
-
-template <bool B, typename T = void>
-using enable_if_t = typename std::enable_if<B, T>::type;
-
-#else
+#if defined(__HIPCC_RTC__) || defined(CK_CODE_GEN_RTC)
 template <bool B, class T = void>
 struct enable_if
 {
@@ -26,6 +18,12 @@ struct enable_if<true, T>
 
 template <bool B, class T = void>
 using enable_if_t = typename enable_if<B, T>::type;
-#endif
 
+#else
+template <bool B, typename T = void>
+using enable_if = std::enable_if<B, T>;
+
+template <bool B, typename T = void>
+using enable_if_t = typename std::enable_if<B, T>::type;
+#endif
 } // namespace ck
diff --git a/include/ck/utility/loop_scheduler.hpp b/include/ck/utility/loop_scheduler.hpp
index 837ff66312..cbbce85007 100644
--- a/include/ck/utility/loop_scheduler.hpp
+++ b/include/ck/utility/loop_scheduler.hpp
@@ -1,12 +1,12 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
-#ifndef CK_CODE_GEN_RTC
+#pragma once
+
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 #include <ostream>
 #endif
 
-#pragma once
-
 #include "ck/utility/common_header.hpp"
 
 namespace ck {
@@ -28,7 +28,7 @@ constexpr LoopScheduler make_default_loop_scheduler()
 
 } // namespace ck
 
-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 inline std::ostream& operator<<(std::ostream& os, const ck::LoopScheduler& s)
 {
     switch(s)
diff --git a/include/ck/utility/magic_division.hpp b/include/ck/utility/magic_division.hpp
index 03eb7c646d..05ae9093e2 100644
--- a/include/ck/utility/magic_division.hpp
+++ b/include/ck/utility/magic_division.hpp
@@ -4,6 +4,7 @@
 #pragma once
 
 #include "ck/ck.hpp"
+#include "data_type.hpp"
 #include "integral_constant.hpp"
 #include "number.hpp"
 #include "type.hpp"
@@ -34,7 +35,7 @@ struct MagicDivision
         // WARNING: magic division is only applicable for division inside this range.
         // You should use the return value of CalculateMagicNumbers, if division is not inside this
         // range. The "else" logic below is to quiet down run-time error.
-        if(divisor >= 1 && divisor <= INT32_MAX)
+        if(divisor >= 1 && divisor <= ck::NumericLimits<int32_t>::Max())
         {
             uint32_t shift = 0;
             for(shift = 0; shift < 32; ++shift)
diff --git a/include/ck/utility/math_v2.hpp b/include/ck/utility/math_v2.hpp
index b31b46fb5f..e235f51c93 100644
--- a/include/ck/utility/math_v2.hpp
+++ b/include/ck/utility/math_v2.hpp
@@ -19,7 +19,7 @@ extern "C" __device__ float __ocml_native_recip_f32(float);
 #endif
 
 // math functions for the host,  some are implemented by calling C++ std functions
-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 static inline __host__ float abs(float x) { return std::abs(x); };
 
 static inline __host__ double abs(double x) { return std::abs(x); };
@@ -924,5 +924,23 @@ inline __device__ double expm1<double>(double x)
     return expm1(x);
 };
 
+template <typename T>
+inline __device__ T cos(T x)
+{
+    return ck::type_convert<T>(cosf(ck::type_convert<float>(x)));
+};
+
+template <>
+inline __device__ float cos<float>(float x)
+{
+    return cosf(x);
+};
+
+template <>
+inline __device__ double cos<double>(double x)
+{
+    return cos(x);
+};
+
 } // namespace math
 } // namespace ck
diff --git a/include/ck/utility/sequence.hpp b/include/ck/utility/sequence.hpp
index 6061d48118..25dae4e335 100644
--- a/include/ck/utility/sequence.hpp
+++ b/include/ck/utility/sequence.hpp
@@ -3,7 +3,7 @@
 
 #pragma once
 
-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 #include <ostream>
 #endif
 
@@ -902,7 +902,7 @@ using uniform_sequence_gen_t = typename uniform_sequence_gen<NSize, I>::type;
 
 } // namespace ck
 
-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 template <ck::index_t... Is>
 std::ostream& operator<<(std::ostream& os, const ck::Sequence<Is...>)
 {
diff --git a/include/ck/utility/tuple_helper.hpp b/include/ck/utility/tuple_helper.hpp
index 596c748a2a..b4f1545aa9 100644
--- a/include/ck/utility/tuple_helper.hpp
+++ b/include/ck/utility/tuple_helper.hpp
@@ -159,7 +159,7 @@ __host__ __device__ constexpr auto TupleReduce(F&& f, const Tuple<Ts...>& tuple)
     }
 }
 
-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 template <typename T>
 using is_tuple = decltype(ck::declval<T&>().IsTuple());
 #endif
@@ -167,7 +167,7 @@ using is_tuple = decltype(ck::declval<T&>().IsTuple());
 template <typename... Ts>
 __host__ __device__ constexpr auto IsNestedTuple(const Tuple<Ts...>&)
 {
-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
     return (is_detected<is_tuple, Ts>::value || ...);
 #endif
 }
diff --git a/include/ck/utility/type.hpp b/include/ck/utility/type.hpp
index ef9326ae57..bde9c179ce 100644
--- a/include/ck/utility/type.hpp
+++ b/include/ck/utility/type.hpp
@@ -1,316 +1,313 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
-
-#pragma once
-
-#include "ck/ck.hpp"
-#include "ck/utility/enable_if.hpp"
-#include "ck/utility/integral_constant.hpp"
-
-namespace ck {
-#ifdef CK_CODE_GEN_RTC
-// NOLINTNEXTLINE
-#define CK_BUILTIN_TYPE_TRAIT1(name)         \
-    template <class T>                       \
-    struct name : bool_constant<__##name(T)> \
-    {                                        \
-    }
-
-// NOLINTNEXTLINE
-#define CK_BUILTIN_TYPE_TRAIT2(name)            \
-    template <class T, class U>                 \
-    struct name : bool_constant<__##name(T, U)> \
-    {                                           \
-    }
-
-// NOLINTNEXTLINE
-#define CK_BUILTIN_TYPE_TRAITN(name)             \
-    template <class... Ts>                       \
-    struct name : bool_constant<__##name(Ts...)> \
-    {                                            \
-    }
-
-CK_BUILTIN_TYPE_TRAIT1(is_class);
-CK_BUILTIN_TYPE_TRAIT1(is_pointer);
-CK_BUILTIN_TYPE_TRAIT1(is_reference);
-CK_BUILTIN_TYPE_TRAIT1(is_trivially_copyable);
-CK_BUILTIN_TYPE_TRAIT1(is_unsigned);
-CK_BUILTIN_TYPE_TRAIT2(is_base_of);
-
-template <class T>
-struct remove_cv
-{
-    using type = T;
-};
-
-template <class T>
-struct remove_cv<const T> : remove_cv<T>
-{
-};
-
-template <class T>
-struct remove_cv<volatile T> : remove_cv<T>
-{
-};
-
-template <class T>
-struct remove_reference
-{
-    typedef T type;
-};
-template <class T>
-struct remove_reference<T&>
-{
-    typedef T type;
-};
-template <class T>
-struct remove_reference<T&&>
-{
-    typedef T type;
-};
-template <class T>
-struct remove_pointer
-{
-    typedef T type;
-};
-template <class T>
-struct remove_pointer<T*>
-{
-    typedef T type;
-};
-template <class T>
-struct remove_pointer<T* const>
-{
-    typedef T type;
-};
-template <class T>
-struct remove_pointer<T* volatile>
-{
-    typedef T type;
-};
-template <class T>
-struct remove_pointer<T* const volatile>
-{
-    typedef T type;
-};
-
-template <typename T>
-constexpr T&& forward(typename remove_reference<T>::type& t_) noexcept
-{
-    return static_cast<T&&>(t_);
-}
-template <typename T>
-constexpr T&& forward(typename remove_reference<T>::type&& t_) noexcept
-{
-    return static_cast<T&&>(t_);
-}
-
-template <class T>
-struct is_const : public integral_constant<bool, false>
-{
-};
-template <class T>
-struct is_const<const T> : public integral_constant<bool, true>
-{
-};
-template <class T>
-inline constexpr bool is_const_v = is_const<T>::value;
-
-template <typename T>
-inline constexpr bool is_reference_v = is_reference<T>::value;
-
-template <class T>
-struct remove_const
-{
-    typedef T type;
-};
-template <class T>
-struct remove_const<const T>
-{
-    typedef T type;
-};
-template <class T>
-using remove_const_t = typename remove_const<T>::type;
-template <class T>
-inline constexpr bool is_class_v = is_class<T>::value;
-
-template <class T>
-inline constexpr bool is_trivially_copyable_v = is_trivially_copyable<T>::value;
-// template <typename T>
-// T&& declval() noexcept;
-
-template <class T, class U = T&&>
-U private_declval(int);
-
-template <class T>
-T private_declval(long);
-
-template <class T>
-auto declval() noexcept -> decltype(private_declval<T>(0));
-
-template <class...>
-using void_t = void;
-#else
-#include <utility>
-#include <type_traits>
-using std::declval;
-using std::forward;
-using std::is_base_of;
-using std::is_class;
-using std::is_class_v;
-using std::is_const_v;
-using std::is_pointer;
-using std::is_reference;
-using std::is_reference_v;
-using std::is_trivially_copyable;
-using std::is_trivially_copyable_v;
-using std::is_unsigned;
-using std::remove_const_t;
-using std::remove_cv;
-using std::remove_pointer;
-using std::remove_reference;
-using std::void_t;
-#endif
-
-template <typename X, typename Y>
-struct is_same : public integral_constant<bool, false>
-{
-};
-
-template <typename X>
-struct is_same<X, X> : public integral_constant<bool, true>
-{
-};
-
-template <typename X>
-struct is_floating_point : public integral_constant<bool, false>
-{
-};
-
-template <>
-struct is_floating_point<float> : public integral_constant<bool, true>
-{
-};
-
-template <>
-struct is_floating_point<double> : public integral_constant<bool, true>
-{
-};
-template <>
-struct is_floating_point<long double> : public integral_constant<bool, true>
-{
-};
-
-template <typename X>
-struct is_integral : public integral_constant<bool, false>
-{
-};
-
-template <>
-struct is_integral<int> : public integral_constant<bool, true>
-{
-};
-
-template <>
-struct is_integral<unsigned int> : public integral_constant<bool, true>
-{
-};
-
-template <>
-struct is_integral<long> : public integral_constant<bool, true>
-{
-};
-
-template <>
-struct is_integral<unsigned long> : public integral_constant<bool, true>
-{
-};
-
-template <>
-struct is_integral<short> : public integral_constant<bool, true>
-{
-};
-template <>
-struct is_integral<unsigned short> : public integral_constant<bool, true>
-{
-};
-
-template <>
-struct is_integral<long long> : public integral_constant<bool, true>
-{
-};
-
-template <>
-struct is_integral<unsigned long long> : public integral_constant<bool, true>
-{
-};
-
-template <>
-struct is_integral<char> : public integral_constant<bool, true>
-{
-};
-
-template <>
-struct is_integral<signed char> : public integral_constant<bool, true>
-{
-};
-
-template <>
-struct is_integral<unsigned char> : public integral_constant<bool, true>
-{
-};
-
-template <>
-struct is_integral<wchar_t> : public integral_constant<bool, true>
-{
-};
-template <>
-struct is_integral<char16_t> : public integral_constant<bool, true>
-{
-};
-
-template <>
-struct is_integral<char32_t> : public integral_constant<bool, true>
-{
-};
-
-template <>
-struct is_integral<bool> : public integral_constant<bool, true>
-{
-};
-
-template <typename X, typename Y>
-inline constexpr bool is_same_v = is_same<X, Y>::value;
-
-template <typename X, typename Y>
-inline constexpr bool is_base_of_v = is_base_of<X, Y>::value;
-
-template <typename T>
-inline constexpr bool is_unsigned_v = is_unsigned<T>::value;
-
-template <typename T>
-using remove_reference_t = typename remove_reference<T>::type;
-
-template <typename T>
-using remove_reference_t = typename remove_reference<T>::type;
-
-template <typename T>
-using remove_cv_t = typename remove_cv<T>::type;
-template <typename T>
-using remove_cvref_t = remove_cv_t<remove_reference_t<T>>;
-
-template <typename T>
-using remove_pointer_t = typename remove_pointer<T>::type;
-
-template <typename T>
-inline constexpr bool is_pointer_v = is_pointer<T>::value;
-
-template <typename Y, typename X, typename enable_if<sizeof(X) == sizeof(Y), bool>::type = false>
-__host__ __device__ constexpr Y bit_cast(const X& x)
-{
-    static_assert(__has_builtin(__builtin_bit_cast), "");
-    static_assert(sizeof(X) == sizeof(Y), "Do not support cast between different size of type");
-
-    return __builtin_bit_cast(Y, x);
-}
-} // namespace ck
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/ck.hpp"
+#include "ck/utility/enable_if.hpp"
+#include "ck/utility/integral_constant.hpp"
+
+namespace ck {
+#if defined(__HIPCC_RTC__) || defined(CK_CODE_GEN_RTC)
+// NOLINTNEXTLINE
+#define CK_BUILTIN_TYPE_TRAIT1(name)         \
+    template <class T>                       \
+    struct name : bool_constant<__##name(T)> \
+    {                                        \
+    }
+
+// NOLINTNEXTLINE
+#define CK_BUILTIN_TYPE_TRAIT2(name)            \
+    template <class T, class U>                 \
+    struct name : bool_constant<__##name(T, U)> \
+    {                                           \
+    }
+
+// NOLINTNEXTLINE
+#define CK_BUILTIN_TYPE_TRAITN(name)             \
+    template <class... Ts>                       \
+    struct name : bool_constant<__##name(Ts...)> \
+    {                                            \
+    }
+
+CK_BUILTIN_TYPE_TRAIT1(is_class);
+CK_BUILTIN_TYPE_TRAIT1(is_pointer);
+CK_BUILTIN_TYPE_TRAIT1(is_reference);
+CK_BUILTIN_TYPE_TRAIT1(is_trivially_copyable);
+CK_BUILTIN_TYPE_TRAIT1(is_unsigned);
+CK_BUILTIN_TYPE_TRAIT2(is_base_of);
+
+template <class T>
+struct remove_cv
+{
+    using type = T;
+};
+
+template <class T>
+struct remove_cv<const T> : remove_cv<T>
+{
+};
+
+template <class T>
+struct remove_cv<volatile T> : remove_cv<T>
+{
+};
+
+template <class T>
+struct remove_reference
+{
+    typedef T type;
+};
+template <class T>
+struct remove_reference<T&>
+{
+    typedef T type;
+};
+template <class T>
+struct remove_reference<T&&>
+{
+    typedef T type;
+};
+template <class T>
+struct remove_pointer
+{
+    typedef T type;
+};
+template <class T>
+struct remove_pointer<T*>
+{
+    typedef T type;
+};
+template <class T>
+struct remove_pointer<T* const>
+{
+    typedef T type;
+};
+template <class T>
+struct remove_pointer<T* volatile>
+{
+    typedef T type;
+};
+template <class T>
+struct remove_pointer<T* const volatile>
+{
+    typedef T type;
+};
+
+template <typename T>
+constexpr T&& forward(typename remove_reference<T>::type& t_) noexcept
+{
+    return static_cast<T&&>(t_);
+}
+template <typename T>
+constexpr T&& forward(typename remove_reference<T>::type&& t_) noexcept
+{
+    return static_cast<T&&>(t_);
+}
+
+template <class T>
+struct is_const : public integral_constant<bool, false>
+{
+};
+template <class T>
+struct is_const<const T> : public integral_constant<bool, true>
+{
+};
+template <class T>
+inline constexpr bool is_const_v = is_const<T>::value;
+
+template <typename T>
+inline constexpr bool is_reference_v = is_reference<T>::value;
+
+template <class T>
+struct remove_const
+{
+    typedef T type;
+};
+template <class T>
+struct remove_const<const T>
+{
+    typedef T type;
+};
+template <class T>
+using remove_const_t = typename remove_const<T>::type;
+template <class T>
+inline constexpr bool is_class_v = is_class<T>::value;
+
+template <class T>
+inline constexpr bool is_trivially_copyable_v = is_trivially_copyable<T>::value;
+// template <typename T>
+// T&& declval() noexcept;
+
+template <class T, class U = T&&>
+U private_declval(int);
+
+template <class T>
+T private_declval(long);
+
+template <class T>
+auto declval() noexcept -> decltype(private_declval<T>(0));
+
+template <class...>
+using void_t = void;
+#else
+#include <utility>
+#include <type_traits>
+using std::declval;
+using std::forward;
+using std::is_base_of;
+using std::is_class;
+using std::is_class_v;
+using std::is_const_v;
+using std::is_pointer;
+using std::is_reference;
+using std::is_reference_v;
+using std::is_trivially_copyable;
+using std::is_trivially_copyable_v;
+using std::is_unsigned;
+using std::remove_const_t;
+using std::remove_cv;
+using std::remove_pointer;
+using std::remove_reference;
+using std::void_t;
+#endif
+
+template <typename X, typename Y>
+struct is_same : public integral_constant<bool, false>
+{
+};
+
+template <typename X>
+struct is_same<X, X> : public integral_constant<bool, true>
+{
+};
+
+template <typename X>
+struct is_floating_point : public integral_constant<bool, false>
+{
+};
+
+template <>
+struct is_floating_point<float> : public integral_constant<bool, true>
+{
+};
+
+template <>
+struct is_floating_point<double> : public integral_constant<bool, true>
+{
+};
+template <>
+struct is_floating_point<long double> : public integral_constant<bool, true>
+{
+};
+
+template <typename X>
+struct is_integral : public integral_constant<bool, false>
+{
+};
+
+template <>
+struct is_integral<int> : public integral_constant<bool, true>
+{
+};
+
+template <>
+struct is_integral<unsigned int> : public integral_constant<bool, true>
+{
+};
+
+template <>
+struct is_integral<long> : public integral_constant<bool, true>
+{
+};
+
+template <>
+struct is_integral<unsigned long> : public integral_constant<bool, true>
+{
+};
+
+template <>
+struct is_integral<short> : public integral_constant<bool, true>
+{
+};
+template <>
+struct is_integral<unsigned short> : public integral_constant<bool, true>
+{
+};
+
+template <>
+struct is_integral<long long> : public integral_constant<bool, true>
+{
+};
+
+template <>
+struct is_integral<unsigned long long> : public integral_constant<bool, true>
+{
+};
+
+template <>
+struct is_integral<char> : public integral_constant<bool, true>
+{
+};
+
+template <>
+struct is_integral<signed char> : public integral_constant<bool, true>
+{
+};
+
+template <>
+struct is_integral<unsigned char> : public integral_constant<bool, true>
+{
+};
+
+template <>
+struct is_integral<wchar_t> : public integral_constant<bool, true>
+{
+};
+template <>
+struct is_integral<char16_t> : public integral_constant<bool, true>
+{
+};
+
+template <>
+struct is_integral<char32_t> : public integral_constant<bool, true>
+{
+};
+
+template <>
+struct is_integral<bool> : public integral_constant<bool, true>
+{
+};
+
+template <typename X, typename Y>
+inline constexpr bool is_same_v = is_same<X, Y>::value;
+
+template <typename X, typename Y>
+inline constexpr bool is_base_of_v = is_base_of<X, Y>::value;
+
+template <typename T>
+inline constexpr bool is_unsigned_v = is_unsigned<T>::value;
+
+template <typename T>
+using remove_reference_t = typename remove_reference<T>::type;
+
+template <typename T>
+using remove_cv_t = typename remove_cv<T>::type;
+template <typename T>
+using remove_cvref_t = remove_cv_t<remove_reference_t<T>>;
+
+template <typename T>
+using remove_pointer_t = typename remove_pointer<T>::type;
+
+template <typename T>
+inline constexpr bool is_pointer_v = is_pointer<T>::value;
+
+template <typename Y, typename X, typename enable_if<sizeof(X) == sizeof(Y), bool>::type = false>
+__host__ __device__ constexpr Y bit_cast(const X& x)
+{
+    static_assert(__has_builtin(__builtin_bit_cast), "");
+    static_assert(sizeof(X) == sizeof(Y), "Do not support cast between different size of type");
+
+    return __builtin_bit_cast(Y, x);
+}
+} // namespace ck
diff --git a/include/ck/utility/type_convert.hpp b/include/ck/utility/type_convert.hpp
index e9b2e3fff2..cf862ae640 100644
--- a/include/ck/utility/type_convert.hpp
+++ b/include/ck/utility/type_convert.hpp
@@ -279,7 +279,6 @@ inline __host__ __device__ f8_fnuz_t f8_convert_sr<f8_fnuz_t, half_t>(half_t x)
     constexpr bool clip              = true;
     constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
     constexpr int seed               = 1254739;
-
 #ifndef CK_CODE_GEN_RTC
     uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<uintptr_t>(&x), x);
 #else
@@ -344,7 +343,6 @@ inline __host__ __device__ bf8_fnuz_t f8_convert_sr<bf8_fnuz_t, half_t>(half_t x
     constexpr bool clip              = true;
     constexpr f8_rounding_mode rm    = f8_rounding_mode::stochastic;
     constexpr int seed               = 1254739;
-
 #ifndef CK_CODE_GEN_RTC
     uint32_t rng = prand_generator<half_t, seed>(reinterpret_cast<uintptr_t>(&x), x);
 #else
@@ -1981,7 +1979,7 @@ inline __host__ __device__ float32_t type_convert<float32_t, bf6x32_t>(bf6x32_t
 #endif
 }
 
-#ifndef CK_CODE_GEN_RTC
+#if !defined(__HIPCC_RTC__) || !defined(CK_CODE_GEN_RTC)
 template <typename Y, typename X, size_t NumElems>
 inline __host__ __device__ void array_convert(std::array<Y, NumElems>& y,
                                               const std::array<X, NumElems>& x)
diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
index 107aae5516..4e0deb1547 100644
--- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp
+++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -1309,7 +1309,9 @@ CK_TILE_DEVICE thread_buffer<T, N> amd_buffer_load_impl(int32x4_t src_wave_buffe
              (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
             (std::is_same<T, fp8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
             (std::is_same<T, bf8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
-            (std::is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
+            (std::is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
+            (std::is_same<T, pk_int4_t>::value &&
+             (N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32)),
         "wrong! not implemented");
 
     using rtn_type = thread_buffer<T, N>;
diff --git a/include/ck_tile/core/container/array.hpp b/include/ck_tile/core/container/array.hpp
index 78768bbbfc..fa63597db4 100644
--- a/include/ck_tile/core/container/array.hpp
+++ b/include/ck_tile/core/container/array.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -153,12 +153,12 @@ struct array<T, 0>
     CK_TILE_HOST_DEVICE void print() const { printf("array{size: 0, data: []}"); }
 };
 
-template <typename>
+template <typename, typename>
 struct vector_traits;
 
 // specialization for array
 template <typename T, index_t N>
-struct vector_traits<array<T, N>>
+struct vector_traits<array<T, N>, void>
 {
     using scalar_type                    = T;
     static constexpr index_t vector_size = N;
diff --git a/include/ck_tile/core/container/thread_buffer.hpp b/include/ck_tile/core/container/thread_buffer.hpp
index 279a48acb3..77c46e1b8c 100644
--- a/include/ck_tile/core/container/thread_buffer.hpp
+++ b/include/ck_tile/core/container/thread_buffer.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -149,17 +149,24 @@ struct thread_buffer {
 };
 // clang-format on
 
-template <typename>
+template <typename, typename>
 struct vector_traits;
 
 // specialization for array
 template <typename T, index_t N>
-struct vector_traits<thread_buffer<T, N>>
+struct vector_traits<thread_buffer<T, N>, std::enable_if_t<!std::is_class_v<T>>>
 {
     using scalar_type                    = T;
     static constexpr index_t vector_size = N;
 };
 
+template <typename T, index_t N>
+struct vector_traits<thread_buffer<T, N>, std::enable_if_t<std::is_class_v<T>>>
+{
+    using scalar_type                    = typename T::type;
+    static constexpr index_t vector_size = N;
+};
+
 #endif
 
 } // namespace ck_tile
diff --git a/include/ck_tile/core/container/tuple.hpp b/include/ck_tile/core/container/tuple.hpp
index 74575f4c6e..fd02177e25 100644
--- a/include/ck_tile/core/container/tuple.hpp
+++ b/include/ck_tile/core/container/tuple.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -294,7 +294,7 @@ struct tuple : impl::tuple_base<make_index_sequence<sizeof...(T)>, T...>
 #undef TP_COM_
 };
 
-template <typename>
+template <typename, typename = void>
 struct vector_traits;
 
 // specialization for array
diff --git a/include/ck_tile/core/numeric/bfloat16.hpp b/include/ck_tile/core/numeric/bfloat16.hpp
index 6ad38b1f7c..6f31468809 100644
--- a/include/ck_tile/core/numeric/bfloat16.hpp
+++ b/include/ck_tile/core/numeric/bfloat16.hpp
@@ -376,14 +376,12 @@ struct numeric<bfloat16_t>
     }
 };
 
-template <typename T>
-struct numeric_traits;
-
 template <>
 struct numeric_traits<bfloat16_t>
 {
-    static constexpr int exp  = 8;
-    static constexpr int mant = 7;
+    static constexpr int exp        = 8;
+    static constexpr int mant       = 7;
+    static constexpr int PackedSize = 1;
 };
 
 #if CK_TILE_USE_CUSTOM_DATA_TYPE
diff --git a/include/ck_tile/core/numeric/float8.hpp b/include/ck_tile/core/numeric/float8.hpp
index c4fc6890c6..facc3e45ee 100644
--- a/include/ck_tile/core/numeric/float8.hpp
+++ b/include/ck_tile/core/numeric/float8.hpp
@@ -207,9 +207,6 @@ using bf8_t     = unsigned _BitInt(8);
 using bf8_raw_t = uint8_t;
 #endif
 
-template <typename T>
-struct numeric_traits;
-
 template <>
 struct numeric_traits<fp8_t>
 {
@@ -225,6 +222,7 @@ struct numeric_traits<fp8_t>
     static constexpr fp8_interpretation f8_interpret = fp8_interpretation::E4M3_FNUZ;
 #endif
     static constexpr uint8_t abs_mask = 0x7F;
+    static constexpr int PackedSize   = 1;
 };
 
 template <>
@@ -242,6 +240,7 @@ struct numeric_traits<bf8_t>
     static constexpr fp8_interpretation f8_interpret = fp8_interpretation::E5M2_FNUZ;
 #endif
     static constexpr uint8_t abs_mask = 0x7F;
+    static constexpr int PackedSize   = 1;
 };
 
 // below is sw fp8 conversion, not utilizing hw instruction
diff --git a/include/ck_tile/core/numeric/half.hpp b/include/ck_tile/core/numeric/half.hpp
index 5779b170b7..8479b33f8f 100644
--- a/include/ck_tile/core/numeric/half.hpp
+++ b/include/ck_tile/core/numeric/half.hpp
@@ -223,9 +223,6 @@ struct numeric<half_t>
     }
 };
 
-template <typename T>
-struct numeric_traits;
-
 template <>
 struct numeric_traits<half_t>
 {
@@ -241,6 +238,7 @@ struct numeric_traits<half_t>
     static constexpr uint16_t NegInf    = 0xFC00;
     static constexpr uint16_t NaN       = 0x7C01;
     static constexpr uint16_t Neg0      = 0x8000;
+    static constexpr int PackedSize     = 1;
     using bitwise_type                  = uint16_t;
 };
 
@@ -383,4 +381,24 @@ half_t exp2(half_t x) { return static_cast<half_t>(exp2f(static_cast<float>(x)))
 CK_TILE_DEVICE
 half_t log(half_t x) { return static_cast<half_t>(__logf(static_cast<float>(x))); };
 #endif
+
+using fp16x2_t = _Float16 __attribute__((ext_vector_type(2)));
+
+CK_TILE_HOST fp16x2_t pk_add_f16(const fp16x2_t& x, const fp16x2_t& y)
+{
+    fp16x2_t vector_res;
+
+    vector_res.x = x.x + y.x;
+    vector_res.y = x.y + y.y;
+
+    return vector_res;
+}
+
+CK_TILE_DEVICE fp16x2_t pk_add_f16(const fp16x2_t& x, const fp16x2_t& y)
+{
+    fp16x2_t c;
+    asm volatile("v_pk_add_f16 %0, %1, %2" : "=v"(c) : "v"(x), "v"(y));
+    return c;
+}
+
 } // namespace ck_tile
diff --git a/include/ck_tile/core/numeric/int8.hpp b/include/ck_tile/core/numeric/int8.hpp
index 9ca3333c39..34d9a1c4b9 100644
--- a/include/ck_tile/core/numeric/int8.hpp
+++ b/include/ck_tile/core/numeric/int8.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck_tile/core/config.hpp"
 #include "ck_tile/core/numeric/half.hpp"
@@ -74,8 +74,6 @@ struct numeric<int8_t>
 };
 
 #if 0
-template <typename T>
-struct numeric_traits;
 
 template <>
 struct numeric_traits<int8_t>
@@ -91,6 +89,7 @@ struct numeric_traits<int8_t>
     static constexpr uint32_t NegInf    = 0xFC00;
     static constexpr uint32_t NaN       = 0x7C01;
     static constexpr uint32_t Neg0      = 0x8000;
+    static constexpr int PackedSize           = 1;
     using bitwise_type                  = uint16_t;
 };
 #endif
diff --git a/include/ck_tile/core/numeric/numeric.hpp b/include/ck_tile/core/numeric/numeric.hpp
index 6b16485b48..f125fbf2ce 100644
--- a/include/ck_tile/core/numeric/numeric.hpp
+++ b/include/ck_tile/core/numeric/numeric.hpp
@@ -77,7 +77,10 @@ struct numeric
 };
 
 template <typename T>
-struct numeric_traits;
+struct numeric_traits
+{
+    static constexpr int PackedSize = 1;
+};
 
 template <>
 struct numeric_traits<float>
@@ -94,6 +97,7 @@ struct numeric_traits<float>
     static constexpr uint32_t NegInf    = 0xFF800000;
     static constexpr uint32_t NaN       = 0x7F800001;
     static constexpr uint32_t Neg0      = 0x80000000;
+    static constexpr int PackedSize     = 1;
     using bitwise_type                  = uint32_t;
 };
 
diff --git a/include/ck_tile/core/numeric/pk_int4.hpp b/include/ck_tile/core/numeric/pk_int4.hpp
index 2ffcc36ced..541093e337 100644
--- a/include/ck_tile/core/numeric/pk_int4.hpp
+++ b/include/ck_tile/core/numeric/pk_int4.hpp
@@ -21,8 +21,8 @@ struct pk_int4_t
 {
     using type = int8_t;
     type data;
-    __host__ __device__ constexpr pk_int4_t() : data{type{}} {}
-    __host__ __device__ constexpr pk_int4_t(type init) : data{init} {}
+    CK_TILE_HOST_DEVICE constexpr pk_int4_t() : data{type{}} {}
+    CK_TILE_HOST_DEVICE constexpr pk_int4_t(type init) : data{init} {}
 };
 
 // limits
@@ -91,6 +91,16 @@ struct numeric<pk_int4_t>
     CK_TILE_HOST_DEVICE static constexpr pk_int4_t zero() { return 0; }
 };
 
+template <>
+struct numeric_traits<pk_int4_t>
+{
+    static constexpr int PackedSize = 2;
+};
+
+using fp32x2_t = float __attribute__((ext_vector_type(2)));
+using fp16x2_t = _Float16 __attribute__((ext_vector_type(2)));
+using bf16x2_t = bf16_raw_t __attribute__((ext_vector_type(2)));
+
 CK_TILE_HOST_DEVICE fp32x2_t pk_int4_t_to_fp32x2_t(const pk_int4_t& x)
 {
     uint8_t x_u8 = ck_tile::bit_cast<uint8_t>(x);
diff --git a/include/ck_tile/core/numeric/vector_type.hpp b/include/ck_tile/core/numeric/vector_type.hpp
index 480da96596..b165275a8c 100644
--- a/include/ck_tile/core/numeric/vector_type.hpp
+++ b/include/ck_tile/core/numeric/vector_type.hpp
@@ -10,6 +10,7 @@
 #include "ck_tile/core/numeric/float8.hpp"
 #include "ck_tile/core/numeric/half.hpp"
 #include "ck_tile/core/numeric/bfloat16.hpp"
+#include "ck_tile/core/numeric/pk_int4.hpp"
 #include "ck_tile/core/utility/type_traits.hpp"
 
 namespace ck_tile {
@@ -30,17 +31,34 @@ struct native_t
 // of compiler errors e.g. struct A; using Ax2_t = A __attribute__((ext_vector_type(2)));  -> will
 // have compiler error
 namespace impl {
+
+template <typename T_, index_t N_, typename = void>
+struct ext_vector;
+
 template <typename T_, index_t N_>
-struct ext_vector
+struct ext_vector<T_, N_, std::enable_if_t<!std::is_class_v<typename native_t<T_>::type>>>
 {
     static constexpr index_t N = N_;
-    using value_type           = typename native_t<remove_cvref_t<T_>>::type;
+    // struct type is not supported for ext_vector
+    using value_type = typename native_t<T_>::type;
+    static_assert(!std::is_class_v<value_type>);
+    using type = value_type __attribute__((ext_vector_type(N))); // this is danguous
+};
+
+template <typename T_, index_t N_>
+struct ext_vector<T_, N_, std::enable_if_t<std::is_class_v<typename native_t<T_>::type>>>
+{
+    static constexpr index_t N = N_;
+    // struct type is not supported for ext_vector
+    using value_type = typename native_t<T_>::type::type;
     static_assert(!std::is_class_v<value_type>);
     using type = value_type __attribute__((ext_vector_type(N))); // this is danguous
 };
 
 template <typename V_, index_t Vs_, index_t N_>
-struct ext_vector<V_ __attribute__((ext_vector_type(Vs_))), N_>
+struct ext_vector<V_ __attribute__((ext_vector_type(Vs_))),
+                  N_,
+                  std::enable_if_t<!std::is_class_v<typename native_t<V_>::type>>>
 {
     static constexpr index_t N = Vs_ * N_;
     using value_type           = typename native_t<remove_cvref_t<V_>>::type;
@@ -48,6 +66,17 @@ struct ext_vector<V_ __attribute__((ext_vector_type(Vs_))), N_>
     using type = value_type __attribute__((ext_vector_type(N))); // this is danguous
 };
 
+template <typename V_, index_t Vs_, index_t N_>
+struct ext_vector<V_ __attribute__((ext_vector_type(Vs_))),
+                  N_,
+                  std::enable_if_t<std::is_class_v<typename native_t<V_>::type>>>
+{
+    static constexpr index_t N = Vs_ * N_;
+    using value_type           = typename native_t<remove_cvref_t<V_>>::type::type;
+    static_assert(!std::is_class_v<value_type>);
+    using type = value_type __attribute__((ext_vector_type(N))); // this is danguous
+};
+
 } // namespace impl
 
 template <typename T, index_t N>
@@ -55,10 +84,11 @@ using ext_vector_t = typename impl::ext_vector<T, N>::type;
 
 // by default, any type will result in a vector_size=1 with scalar_type=T traits.
 // ... unless we have other vector_traits specialization
-template <typename T>
+template <typename T, typename>
 struct vector_traits
 {
-    using scalar_type                    = remove_cvref_t<T>;
+    using scalar_type =
+        std::conditional_t<std::is_same_v<remove_cvref_t<T>, pk_int4_t>, int8_t, remove_cvref_t<T>>;
     static constexpr index_t vector_size = 1;
 };
 
@@ -66,7 +96,7 @@ struct vector_traits
 template <typename T, index_t N>
 struct vector_traits<T __attribute__((ext_vector_type(N)))>
 {
-    using scalar_type                    = T;
+    using scalar_type = std::conditional_t<std::is_same_v<T, pk_int4_t>, int8_t, T>;
     static constexpr index_t vector_size = N;
 };
 
@@ -200,21 +230,11 @@ using bf8x32_t = bf8_t __attribute((ext_vector_type(32)));
 using bf8x64_t = bf8_t __attribute((ext_vector_type(64)));
 #endif
 
-CK_TILE_HOST fp16x2_t pk_add_f16(const fp16x2_t& x, const fp16x2_t& y)
-{
-    fp16x2_t vector_res;
-
-    vector_res.x = x.x + y.x;
-    vector_res.y = x.y + y.y;
-
-    return vector_res;
-}
-
-CK_TILE_DEVICE fp16x2_t pk_add_f16(const fp16x2_t& x, const fp16x2_t& y)
-{
-    fp16x2_t c;
-    asm volatile("v_pk_add_f16 %0, %1, %2" : "=v"(c) : "v"(x), "v"(y));
-    return c;
-}
-
+// pk_int4_t
+// using pk_int4_t
+using pk_int4x2_t  = int8_t __attribute((ext_vector_type(2)));
+using pk_int4x4_t  = int8_t __attribute((ext_vector_type(4)));
+using pk_int4x8_t  = int8_t __attribute((ext_vector_type(8)));
+using pk_int4x16_t = int8_t __attribute((ext_vector_type(16)));
+using pk_int4x32_t = int8_t __attribute((ext_vector_type(32)));
 } // namespace ck_tile
diff --git a/include/ck_tile/core/tensor/buffer_view.hpp b/include/ck_tile/core/tensor/buffer_view.hpp
index 7dffa0e555..c2a093f1ab 100644
--- a/include/ck_tile/core/tensor/buffer_view.hpp
+++ b/include/ck_tile/core/tensor/buffer_view.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -231,13 +231,18 @@ struct buffer_view<address_space_enum::global,
     int32x4_t cached_buf_res_;
     remove_cvref_t<T> invalid_element_value_ = T{0};
 
+    static constexpr index_t PackedSize = ck_tile::numeric_traits<remove_cvref_t<T>>::PackedSize;
+
     CK_TILE_HOST_DEVICE constexpr buffer_view()
         : p_data_{}, buffer_size_{}, cached_buf_res_{0}, invalid_element_value_{}
     {
     }
 
     CK_TILE_HOST_DEVICE constexpr buffer_view(T* p_data, BufferSizeType buffer_size)
-        : p_data_{p_data}, buffer_size_{buffer_size}, cached_buf_res_{0}, invalid_element_value_{0}
+        : p_data_{p_data},
+          buffer_size_{buffer_size / PackedSize},
+          cached_buf_res_{0},
+          invalid_element_value_{0}
     {
     }
 
@@ -245,7 +250,7 @@ struct buffer_view<address_space_enum::global,
                                               BufferSizeType buffer_size,
                                               T invalid_element_value)
         : p_data_{p_data},
-          buffer_size_{buffer_size},
+          buffer_size_{buffer_size / PackedSize},
           cached_buf_res_{0},
           invalid_element_value_{invalid_element_value}
     {
@@ -255,7 +260,7 @@ struct buffer_view<address_space_enum::global,
     // Must call for buffers that need *_raw load/store
     CK_TILE_HOST_DEVICE void init_raw()
     {
-        cached_buf_res_ = make_wave_buffer_resource(p_data_, buffer_size_ * sizeof(type));
+        cached_buf_res_ = make_wave_buffer_resource(p_data_, (buffer_size_) * sizeof(type));
     }
 
     CK_TILE_DEVICE static constexpr address_space_enum get_address_space()
@@ -887,8 +892,8 @@ struct buffer_view<address_space_enum::lds,
 #endif
 
         i += linear_offset; // simplicity
-        if constexpr(std::is_same<typename vector_traits<remove_cvref_t<T>>::scalar_type,
-                                  int8_t>::value &&
+        if constexpr(std::is_same_v<typename vector_traits<remove_cvref_t<T>>::scalar_type,
+                                    int8_t> &&
                      workaround_int8_ds_write_issue)
         {
             if(is_valid_element)
@@ -897,83 +902,117 @@ struct buffer_view<address_space_enum::lds,
                 // ISA, so I try to let compiler emit IR "store<i32, 4>" which would be lower to
                 // ds_write_b128
                 // TODO: remove this after compiler fix
-                static_assert((std::is_same<remove_cvref_t<T>, int8_t>::value &&
-                               std::is_same<remove_cvref_t<X>, int8_t>::value) ||
-                                  (std::is_same<remove_cvref_t<T>, int8_t>::value &&
-                                   std::is_same<remove_cvref_t<X>, int8x2_t>::value) ||
-                                  (std::is_same<remove_cvref_t<T>, int8_t>::value &&
-                                   std::is_same<remove_cvref_t<X>, int8x4_t>::value) ||
-                                  (std::is_same<remove_cvref_t<T>, int8_t>::value &&
-                                   std::is_same<remove_cvref_t<X>, int8x8_t>::value) ||
-                                  (std::is_same<remove_cvref_t<T>, int8_t>::value &&
-                                   std::is_same<remove_cvref_t<X>, int8x16_t>::value) ||
-                                  (std::is_same<remove_cvref_t<T>, int8x4_t>::value &&
-                                   std::is_same<remove_cvref_t<X>, int8x4_t>::value) ||
-                                  (std::is_same<remove_cvref_t<T>, int8x8_t>::value &&
-                                   std::is_same<remove_cvref_t<X>, int8x8_t>::value) ||
-                                  (std::is_same<remove_cvref_t<T>, int8x16_t>::value &&
-                                   std::is_same<remove_cvref_t<X>, int8x16_t>::value),
-                              "wrong! not implemented for this combination, please add "
-                              "implementation");
+                static_assert(
+                    (std::is_same_v<remove_cvref_t<T>, int8_t> &&
+                     std::is_same_v<remove_cvref_t<X>, int8_t>) ||
+                        (std::is_same_v<remove_cvref_t<T>, int8_t> &&
+                         std::is_same_v<remove_cvref_t<X>, int8x2_t>) ||
+                        (std::is_same_v<remove_cvref_t<T>, int8_t> &&
+                         std::is_same_v<remove_cvref_t<X>, int8x4_t>) ||
+                        (std::is_same_v<remove_cvref_t<T>, int8_t> &&
+                         std::is_same_v<remove_cvref_t<X>, int8x8_t>) ||
+                        (std::is_same_v<remove_cvref_t<T>, int8_t> &&
+                         std::is_same_v<remove_cvref_t<X>, int8x16_t>) ||
+                        (std::is_same_v<remove_cvref_t<T>, int8x4_t> &&
+                         std::is_same_v<remove_cvref_t<X>, int8x4_t>) ||
+                        (std::is_same_v<remove_cvref_t<T>, int8x8_t> &&
+                         std::is_same_v<remove_cvref_t<X>, int8x8_t>) ||
+                        (std::is_same_v<remove_cvref_t<T>, int8x16_t> &&
+                         std::is_same_v<remove_cvref_t<X>, int8x16_t>) ||
+                        // ext_vector_type for pk_int4 must use int8_t as type
+                        (std::is_same_v<remove_cvref_t<T>, pk_int4_t> &&
+                         std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 1>>) ||
+                        (std::is_same_v<remove_cvref_t<T>, pk_int4_t> &&
+                         std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 2>>) ||
+                        (std::is_same_v<remove_cvref_t<T>, pk_int4_t> &&
+                         std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 4>>) ||
+                        (std::is_same_v<remove_cvref_t<T>, pk_int4_t> &&
+                         std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 8>>) ||
+                        (std::is_same_v<remove_cvref_t<T>, pk_int4_t> &&
+                         std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 16>>) ||
+                        (std::is_same_v<remove_cvref_t<T>, pk_int4x4_t> &&
+                         std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 4>>) ||
+                        (std::is_same_v<remove_cvref_t<T>, pk_int4x8_t> &&
+                         std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 8>>) ||
+                        (std::is_same_v<remove_cvref_t<T>, pk_int4x16_t> &&
+                         std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 16>>),
+                    "wrong! not implemented for this combination, please add "
+                    "implementation");
 
-                if constexpr(std::is_same<remove_cvref_t<T>, int8_t>::value &&
-                             std::is_same<remove_cvref_t<X>, int8_t>::value)
+                if constexpr((std::is_same_v<remove_cvref_t<T>, int8_t> &&
+                              std::is_same_v<remove_cvref_t<X>, int8_t>) ||
+                             (std::is_same_v<remove_cvref_t<T>, pk_int4_t> &&
+                              std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 1>>))
                 {
                     // HACK: cast pointer of x is bad
                     // TODO: remove this after compiler fix
                     *c_style_pointer_cast<int8_t*>(&p_data_[i]) =
                         *c_style_pointer_cast<const int8_t*>(&x);
                 }
-                else if constexpr(std::is_same<remove_cvref_t<T>, int8_t>::value &&
-                                  std::is_same<remove_cvref_t<X>, int8x2_t>::value)
+                else if constexpr((std::is_same_v<remove_cvref_t<T>, int8_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, int8x2_t>) ||
+                                  (std::is_same_v<remove_cvref_t<T>, pk_int4_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 2>>))
                 {
                     // HACK: cast pointer of x is bad
                     // TODO: remove this after compiler fix
                     *c_style_pointer_cast<int16_t*>(&p_data_[i]) =
                         *c_style_pointer_cast<const int16_t*>(&x);
                 }
-                else if constexpr(std::is_same<remove_cvref_t<T>, int8_t>::value &&
-                                  std::is_same<remove_cvref_t<X>, int8x4_t>::value)
+                else if constexpr((std::is_same_v<remove_cvref_t<T>, int8_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, int8x4_t>) ||
+                                  (std::is_same_v<remove_cvref_t<T>, pk_int4_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 4>>))
                 {
                     // HACK: cast pointer of x is bad
                     // TODO: remove this after compiler fix
                     *c_style_pointer_cast<int32_t*>(&p_data_[i]) =
                         *c_style_pointer_cast<const int32_t*>(&x);
                 }
-                else if constexpr(std::is_same<remove_cvref_t<T>, int8_t>::value &&
-                                  std::is_same<remove_cvref_t<X>, int8x8_t>::value)
+                else if constexpr((std::is_same_v<remove_cvref_t<T>, int8_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, int8x8_t>) ||
+                                  (std::is_same_v<remove_cvref_t<T>, pk_int4_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 8>>))
                 {
                     // HACK: cast pointer of x is bad
                     // TODO: remove this after compiler fix
                     *c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
                         *c_style_pointer_cast<const int32x2_t*>(&x);
                 }
-                else if constexpr(std::is_same<remove_cvref_t<T>, int8_t>::value &&
-                                  std::is_same<remove_cvref_t<X>, int8x16_t>::value)
+                else if constexpr((std::is_same_v<remove_cvref_t<T>, int8_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, int8x16_t>) ||
+                                  (std::is_same_v<remove_cvref_t<T>, pk_int4_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 16>>))
                 {
                     // HACK: cast pointer of x is bad
                     // TODO: remove this after compiler fix
                     *c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
                         *c_style_pointer_cast<const int32x4_t*>(&x);
                 }
-                else if constexpr(std::is_same<remove_cvref_t<T>, int8x4_t>::value &&
-                                  std::is_same<remove_cvref_t<X>, int8x4_t>::value)
+                else if constexpr((std::is_same_v<remove_cvref_t<T>, int8x4_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, int8x4_t>) ||
+                                  (std::is_same_v<remove_cvref_t<T>, pk_int4x4_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 4>>))
                 {
                     // HACK: cast pointer of x is bad
                     // TODO: remove this after compiler fix
                     *c_style_pointer_cast<int32_t*>(&p_data_[i]) =
                         *c_style_pointer_cast<const int32_t*>(&x);
                 }
-                else if constexpr(std::is_same<remove_cvref_t<T>, int8x8_t>::value &&
-                                  std::is_same<remove_cvref_t<X>, int8x8_t>::value)
+                else if constexpr((std::is_same_v<remove_cvref_t<T>, int8x8_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, int8x8_t>) ||
+                                  (std::is_same_v<remove_cvref_t<T>, pk_int4x8_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 8>>))
                 {
                     // HACK: cast pointer of x is bad
                     // TODO: remove this after compiler fix
                     *c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
                         *c_style_pointer_cast<const int32x2_t*>(&x);
                 }
-                else if constexpr(std::is_same<remove_cvref_t<T>, int8x16_t>::value &&
-                                  std::is_same<remove_cvref_t<X>, int8x16_t>::value)
+                else if constexpr((std::is_same_v<remove_cvref_t<T>, int8x16_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, int8x16_t>) ||
+                                  (std::is_same_v<remove_cvref_t<T>, pk_int4x16_t> &&
+                                   std::is_same_v<remove_cvref_t<X>, thread_buffer<pk_int4_t, 16>>))
                 {
                     // HACK: cast pointer of x is bad
                     // TODO: remove this after compiler fix
diff --git a/include/ck_tile/core/tensor/static_distributed_tensor.hpp b/include/ck_tile/core/tensor/static_distributed_tensor.hpp
index 8d2f88af39..b73a27c8d5 100644
--- a/include/ck_tile/core/tensor/static_distributed_tensor.hpp
+++ b/include/ck_tile/core/tensor/static_distributed_tensor.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -27,6 +27,8 @@ struct static_distributed_tensor
 
     using ThreadTensorDesc =
         remove_cvref_t<decltype(StaticTileDistribution{}.get_ys_to_d_descriptor())>;
+    static constexpr index_t PackedSize =
+        ck_tile::numeric_traits<remove_cvref_t<DataType>>::PackedSize;
 
     static constexpr index_t kThreadElementSpaceSize = ThreadTensorDesc{}.get_element_space_size();
     static_assert(0 < kThreadElementSpaceSize, "Make sure tile distribution is valid");
@@ -59,7 +61,7 @@ struct static_distributed_tensor
 
     CK_TILE_HOST_DEVICE static constexpr index_t get_thread_buffer_size()
     {
-        return kThreadElementSpaceSize;
+        return kThreadElementSpaceSize / PackedSize;
     }
 
     template <index_t... YSliceOrigins, index_t... YSliceLengths>
@@ -79,8 +81,9 @@ struct static_distributed_tensor
         static_ford<sequence<YSliceLengths...>>{}([&](auto idx) {
             constexpr auto idx_ys = idx + sequence<YSliceOrigins...>{};
 
-            sliced_thread_data(number<sliced_thread_tensor_desc.calculate_offset(idx)>{}) =
-                thread_buf_[number<ThreadTensorDesc{}.calculate_offset(idx_ys)>{}];
+            sliced_thread_data(
+                number<sliced_thread_tensor_desc.calculate_offset(idx) / PackedSize>{}) =
+                thread_buf_[number<ThreadTensorDesc{}.calculate_offset(idx_ys) / PackedSize>{}];
         });
 
         return sliced_thread_data;
@@ -101,8 +104,9 @@ struct static_distributed_tensor
         static_ford<sequence<YSliceLengths...>>{}([&](auto idx) {
             constexpr auto idx_ys = idx + sequence<YSliceOrigins...>{};
 
-            thread_buf_(number<ThreadTensorDesc{}.calculate_offset(idx_ys)>{}) =
-                sliced_thread_data[number<sliced_thread_tensor_desc.calculate_offset(idx)>{}];
+            thread_buf_(number<ThreadTensorDesc{}.calculate_offset(idx_ys) / PackedSize>{}) =
+                sliced_thread_data[number<sliced_thread_tensor_desc.calculate_offset(idx) /
+                                          PackedSize>{}];
         });
     }
 
@@ -115,7 +119,7 @@ struct static_distributed_tensor
         constexpr auto y_idx = get_tile_distribution().get_y_indices_from_distributed_indices(
             TileDistributedIndices{});
 
-        return thread_buf_[number<ThreadTensorDesc{}.calculate_offset(y_idx)>{}];
+        return thread_buf_[number<ThreadTensorDesc{}.calculate_offset(y_idx) / PackedSize>{}];
     }
 
     template <typename TileDistributedIndices>
@@ -127,11 +131,11 @@ struct static_distributed_tensor
         constexpr auto y_idx = get_tile_distribution().get_y_indices_from_distributed_indices(
             TileDistributedIndices{});
 
-        return thread_buf_(number<ThreadTensorDesc{}.calculate_offset(y_idx)>{});
+        return thread_buf_(number<ThreadTensorDesc{}.calculate_offset(y_idx) / PackedSize>{});
     }
 
     //
-    thread_buffer<DataType, kThreadElementSpaceSize> thread_buf_;
+    thread_buffer<DataType, get_thread_buffer_size()> thread_buf_;
 };
 
 template <typename DataType, typename StaticTileDistribution>
diff --git a/include/ck_tile/core/tensor/tensor_view.hpp b/include/ck_tile/core/tensor/tensor_view.hpp
index 4c72ed0859..336793c5b1 100644
--- a/include/ck_tile/core/tensor/tensor_view.hpp
+++ b/include/ck_tile/core/tensor/tensor_view.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -45,6 +45,8 @@ struct tensor_view
     using TensorIndex = array<index_t, TensorDesc::get_num_of_top_dimension()>;
     using TensorCoord = decltype(make_tensor_coordinate(TensorDesc{}, TensorIndex{}));
     static constexpr auto DstInMemOp = DstInMemOp_;
+    static constexpr index_t PackedSize =
+        ck_tile::numeric_traits<remove_cvref_t<DataType>>::PackedSize;
 
     CK_TILE_HOST_DEVICE constexpr tensor_view() = default;
 
@@ -81,8 +83,8 @@ struct tensor_view
                             bool_constant<oob_conditional_check> = {}) const
     {
         return buf_.template get<X>(
-            coord.get_offset(),
-            linear_offset,
+            coord.get_offset() / PackedSize,
+            linear_offset / PackedSize,
             coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
             bool_constant<oob_conditional_check>{});
     }
@@ -99,8 +101,8 @@ struct tensor_view
                             bool is_valid_element, // flag
                             bool_constant<oob_conditional_check> = {}) const
     {
-        return buf_.template get<X>(coord.get_offset(),
-                                    linear_offset,
+        return buf_.template get<X>(coord.get_offset() / PackedSize,
+                                    linear_offset / PackedSize,
                                     is_valid_element,
                                     bool_constant<oob_conditional_check>{});
     }
@@ -122,8 +124,8 @@ struct tensor_view
     {
         return buf_.template get_raw<X, oob_conditional_check, pre_nop>(
             dst,
-            coord.get_offset(),
-            linear_offset,
+            coord.get_offset() / PackedSize,
+            linear_offset / PackedSize,
             coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
             bool_constant<pre_nop>{});
     }
@@ -142,8 +144,12 @@ struct tensor_view
                                                          bool_constant<oob_conditional_check> = {},
                                                          bool_constant<pre_nop> = {}) const
     {
-        return buf_.template get_raw<X, oob_conditional_check, pre_nop>(
-            dst, coord.get_offset(), linear_offset, is_valid_element, bool_constant<pre_nop>{});
+        return buf_.template get_raw<X, oob_conditional_check, pre_nop>(dst,
+                                                                        coord.get_offset() /
+                                                                            PackedSize,
+                                                                        linear_offset / PackedSize,
+                                                                        is_valid_element,
+                                                                        bool_constant<pre_nop>{});
     }
 
     template <typename X,
@@ -159,8 +165,8 @@ struct tensor_view
     {
         return buf_.template async_get<X>(
             smem,
-            coord.get_offset(),
-            linear_offset,
+            coord.get_offset() / PackedSize,
+            linear_offset / PackedSize,
             coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
             bool_constant<oob_conditional_check>{});
     }
@@ -178,8 +184,8 @@ struct tensor_view
                                   bool is_valid_element) const
     {
         return buf_.template async_get<X>(smem,
-                                          coord.get_offset(),
-                                          linear_offset,
+                                          coord.get_offset() / PackedSize,
+                                          linear_offset / PackedSize,
                                           is_valid_element,
                                           bool_constant<oob_conditional_check>{});
     }
@@ -198,8 +204,8 @@ struct tensor_view
     {
         return buf_.template async_get_raw<X>(
             smem,
-            coord.get_offset(),
-            linear_offset,
+            coord.get_offset() / PackedSize,
+            linear_offset / PackedSize,
             coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
             bool_constant<pre_nop>{});
     }
@@ -217,8 +223,11 @@ struct tensor_view
                                       bool is_valid_element,
                                       bool_constant<pre_nop> = {}) const
     {
-        return buf_.template async_get_raw<X>(
-            smem, coord.get_offset(), linear_offset, is_valid_element, bool_constant<pre_nop>{});
+        return buf_.template async_get_raw<X>(smem,
+                                              coord.get_offset() / PackedSize,
+                                              linear_offset / PackedSize,
+                                              is_valid_element,
+                                              bool_constant<pre_nop>{});
     }
 
     // X is vector of DataType.
@@ -236,8 +245,8 @@ struct tensor_view
                             bool_constant<oob_conditional_check> = {})
     {
         buf_.template set<X, oob_conditional_check>(
-            coord.get_offset(),
-            linear_offset,
+            coord.get_offset() / PackedSize,
+            linear_offset / PackedSize,
             coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
             x);
     }
@@ -272,8 +281,8 @@ struct tensor_view
                                 bool_constant<oob_conditional_check> = {})
     {
         buf_.template set_raw<X, oob_conditional_check>(
-            coord.get_offset(),
-            linear_offset,
+            coord.get_offset() / PackedSize,
+            linear_offset / PackedSize,
             coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
             x);
     }
@@ -292,7 +301,7 @@ struct tensor_view
                                 bool_constant<oob_conditional_check> = {})
     {
         buf_.template set_raw<X, oob_conditional_check>(
-            coord.get_offset(), linear_offset, is_valid_element, x);
+            coord.get_offset() / PackedSize, linear_offset / PackedSize, is_valid_element, x);
     }
 
     // X is vector of DataType.
@@ -310,8 +319,8 @@ struct tensor_view
                                bool_constant<oob_conditional_check> = {})
     {
         buf_.template update<DstInMemOp, X, oob_conditional_check>(
-            coord.get_offset(),
-            linear_offset,
+            coord.get_offset() / PackedSize,
+            linear_offset / PackedSize,
             coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
             x);
     }
@@ -330,7 +339,7 @@ struct tensor_view
                                bool_constant<oob_conditional_check> = {})
     {
         buf_.template update<DstInMemOp, X, oob_conditional_check>(
-            coord.get_offset(), linear_offset, is_valid_element, x);
+            coord.get_offset() / PackedSize, linear_offset / PackedSize, is_valid_element, x);
     }
 
     // X is vector of DataType.
@@ -350,8 +359,8 @@ struct tensor_view
                                    bool_constant<pre_nop>               = {})
     {
         buf_.template update_raw<DstInMemOp, X, oob_conditional_check, pre_nop>(
-            coord.get_offset(),
-            linear_offset,
+            coord.get_offset() / PackedSize,
+            linear_offset / PackedSize,
             coordinate_has_valid_offset_assuming_top_index_is_valid(desc_, coord),
             x);
     }
@@ -372,7 +381,7 @@ struct tensor_view
                                    bool_constant<pre_nop>               = {})
     {
         buf_.template update_raw<DstInMemOp, X, oob_conditional_check, pre_nop>(
-            coord.get_offset(), linear_offset, is_valid_element, x);
+            coord.get_offset() / PackedSize, linear_offset / PackedSize, is_valid_element, x);
     }
 
     CK_TILE_HOST_DEVICE void print() const
diff --git a/include/ck_tile/core/tensor/tile_window.hpp b/include/ck_tile/core/tensor/tile_window.hpp
index 27c2c24ad5..3bb728df23 100644
--- a/include/ck_tile/core/tensor/tile_window.hpp
+++ b/include/ck_tile/core/tensor/tile_window.hpp
@@ -97,13 +97,15 @@ struct tile_window_with_static_distribution
         }
 
         public:
+        static constexpr index_t PackedSize =
+            ck_tile::numeric_traits<remove_cvref_t<DataType>>::PackedSize;
         static constexpr index_t VectorDimY = get_vector_dim_y_scalar_per_vector().template at<0>();
         static constexpr index_t ScalarPerVector =
             get_vector_dim_y_scalar_per_vector().template at<1>();
 
         // using vector_type_t = vector_type_maker_t<DataType, ScalarPerVector>;
         // using vector_t      = typename vector_type_t::type;
-        using vector_t = thread_buffer<DataType, ScalarPerVector>;
+        using vector_t = thread_buffer<DataType, ScalarPerVector / PackedSize>;
 
         private:
         static constexpr auto scalars_per_access_ = [] {
@@ -336,7 +338,7 @@ struct tile_window_with_static_distribution
                         bottom_tensor_thread_coord, 0, bool_constant<oob_conditional_check>{});
 #if 1
                 // write into distributed tensor
-                static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) {
+                static_for<0, Traits::ScalarPerVector, Traits::PackedSize>{}([&](auto j) {
                     constexpr auto idx_ys = generate_tuple(
                         [&](auto jj) {
                             return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
@@ -345,10 +347,11 @@ struct tile_window_with_static_distribution
                         number<NDimY>{});
 
                     constexpr index_t d =
-                        tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+                        tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
+                        Traits::PackedSize;
 
                     dst_tensor.get_thread_buffer().template at<d>() =
-                        vec_value.template get_as<DataType>()[j];
+                        vec_value.template get_as<DataType>()[j / Traits::PackedSize];
                 });
 #else
                 constexpr index_t d =
@@ -390,8 +393,9 @@ struct tile_window_with_static_distribution
         using SFC_Ys   = typename Traits::SFC_Ys;
         static constexpr index_t YElementSize =
             TileDstr{}.get_ys_to_d_descriptor().get_element_space_size();
-        static_assert(YElementSize % Traits::ScalarPerVector == 0);
-        using vectorized_tbuf = array<vector_t, YElementSize / Traits::ScalarPerVector>;
+        static_assert(YElementSize % (Traits::PackedSize * Traits::ScalarPerVector) == 0);
+        using vectorized_tbuf =
+            array<vector_t, YElementSize / (Traits::PackedSize * Traits::ScalarPerVector)>;
         // StaticBuffer<address_space_enum::vgpr,
         //                                      vector_t,
         //                                      YElementSize / Traits::ScalarPerVector,
@@ -419,7 +423,8 @@ struct tile_window_with_static_distribution
                 // data index [y0, y1, ...]
                 constexpr auto idx_ys_start = SFC_Ys::get_index(iAccess);
                 constexpr index_t d =
-                    tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start);
+                    tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start) /
+                    Traits::PackedSize;
                 static_assert(d % Traits::ScalarPerVector == 0);
 
                 get_bottom_tensor_view().template get_vectorized_elements_raw<vector_t>(
@@ -632,7 +637,7 @@ struct tile_window_with_static_distribution
                 // vector_type_t vec;
                 vector_t vec_value;
 
-                static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) {
+                static_for<0, Traits::ScalarPerVector, Traits::PackedSize>{}([&](auto j) {
                     constexpr auto idx_ys = generate_tuple(
                         [&](auto jj) {
                             return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
@@ -641,9 +646,10 @@ struct tile_window_with_static_distribution
                         number<NDimY>{});
 
                     constexpr index_t d =
-                        tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+                        tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
+                        Traits::PackedSize;
 
-                    vec_value.template get_as<DataType>()(j) =
+                    vec_value.template get_as<DataType>()(j / Traits::PackedSize) =
                         dstr_tensor.get_thread_buffer().template at<d>();
                 });
 
@@ -698,7 +704,7 @@ struct tile_window_with_static_distribution
 
                 // read from distributed tensor
                 vector_t vec_value;
-                static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) {
+                static_for<0, Traits::ScalarPerVector, Traits::PackedSize>{}([&](auto j) {
                     constexpr auto idx_ys = generate_tuple(
                         [&](auto jj) {
                             return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
@@ -706,8 +712,9 @@ struct tile_window_with_static_distribution
                         },
                         number<NDimY>{});
                     constexpr index_t d =
-                        tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
-                    vec_value.template get_as<DataType>()(j) =
+                        tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
+                        Traits::PackedSize;
+                    vec_value.template get_as<DataType>()(j / Traits::PackedSize) =
                         dstr_tensor.get_thread_buffer().template at<d>();
                 });
 
@@ -759,7 +766,7 @@ struct tile_window_with_static_distribution
                 // read from distributed tensor
                 vector_t vec_value;
 
-                static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) {
+                static_for<0, Traits::ScalarPerVector, Traits::PackedSize>{}([&](auto j) {
                     constexpr auto idx_ys = generate_tuple(
                         [&](auto jj) {
                             return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
@@ -768,9 +775,10 @@ struct tile_window_with_static_distribution
                         number<NDimY>{});
 
                     constexpr index_t d =
-                        tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+                        tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
+                        Traits::PackedSize;
 
-                    vec_value.template get_as<DataType>()(j) =
+                    vec_value.template get_as<DataType>()(j / Traits::PackedSize) =
                         dstr_tensor.get_thread_buffer().template at<d>();
                 });
 
@@ -825,7 +833,7 @@ struct tile_window_with_static_distribution
                 // read from distributed tensor
                 vector_t vec_value;
 
-                static_for<0, Traits::ScalarPerVector, 1>{}([&](auto j) {
+                static_for<0, Traits::ScalarPerVector, Traits::PackedSize>{}([&](auto j) {
                     constexpr auto idx_ys = generate_tuple(
                         [&](auto jj) {
                             return jj == Traits::VectorDimY ? (idx_ys_start[jj] + j)
@@ -834,9 +842,10 @@ struct tile_window_with_static_distribution
                         number<NDimY>{});
 
                     constexpr index_t d =
-                        tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+                        tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
+                        Traits::PackedSize;
 
-                    vec_value.template get_as<DataType>()(j) =
+                    vec_value.template get_as<DataType>()(j / Traits::PackedSize) =
                         dstr_tensor.get_thread_buffer().template at<d>();
                 });
 
diff --git a/include/ck_tile/core/tensor/tile_window_linear.hpp b/include/ck_tile/core/tensor/tile_window_linear.hpp
index 96a8352c04..1e24e660f6 100644
--- a/include/ck_tile/core/tensor/tile_window_linear.hpp
+++ b/include/ck_tile/core/tensor/tile_window_linear.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 #include "ck_tile/core/arch/arch.hpp"
@@ -151,11 +151,13 @@ struct tile_window_linear
         }
 
         public:
+        static constexpr index_t PackedSize =
+            ck_tile::numeric_traits<remove_cvref_t<DataType>>::PackedSize;
         static constexpr index_t VectorDimY = get_vector_dim_y_scalar_per_vector().template at<0>();
         static constexpr index_t ScalarPerVector =
             get_vector_dim_y_scalar_per_vector().template at<1>();
 
-        using vector_t = thread_buffer<DataType, ScalarPerVector>;
+        using vector_t = thread_buffer<DataType, ScalarPerVector / PackedSize>;
 
         private:
         static constexpr auto scalars_per_access_ = [] {
@@ -498,17 +500,18 @@ struct tile_window_linear
             // data index [y0, y1, ...]
             constexpr auto idx_diff_ys = SFC_Ys::get_index(IAccess);
             // write into distributed tensor
-            static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
+            static_for<0, traits::ScalarPerVector, traits::PackedSize>{}([&](auto j) {
                 constexpr auto idx_ys = generate_tuple(
                     [&](auto jj) {
                         return jj == traits::VectorDimY ? (idx_diff_ys[jj] + j) : idx_diff_ys[jj];
                     },
                     number<NDimY>{});
 
-                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
+                                      traits::PackedSize;
 
                 dst_tensor.get_thread_buffer().template at<d>() =
-                    vec_value.template get_as<DataType>()[j];
+                    vec_value.template get_as<DataType>()[j / traits::PackedSize];
             });
 #else
             constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start);
@@ -556,17 +559,18 @@ struct tile_window_linear
             // data index [y0, y1, ...]
             constexpr auto idx_diff_ys = SFC_Ys::get_index(IAccess);
             // write into distributed tensor
-            static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
+            static_for<0, traits::ScalarPerVector, traits::PackedSize>{}([&](auto j) {
                 constexpr auto idx_ys = generate_tuple(
                     [&](auto jj) {
                         return jj == traits::VectorDimY ? (idx_diff_ys[jj] + j) : idx_diff_ys[jj];
                     },
                     number<NDimY>{});
 
-                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
+                                      traits::PackedSize;
 
                 dst_tensor.get_thread_buffer().template at<d>() =
-                    vec_value.template get_as<DataType>()[j];
+                    vec_value.template get_as<DataType>()[j / traits::PackedSize];
             });
 #else
             constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start);
@@ -595,8 +599,9 @@ struct tile_window_linear
         using SFC_Ys   = typename traits::SFC_Ys;
         static constexpr index_t YElementSize =
             TileDstr{}.get_ys_to_d_descriptor().get_element_space_size();
-        static_assert(YElementSize % traits::ScalarPerVector == 0);
-        using vectorized_tbuf = array<vector_t, YElementSize / traits::ScalarPerVector>;
+        static_assert(YElementSize % (traits::PackedSize * traits::ScalarPerVector) == 0);
+        using vectorized_tbuf =
+            array<vector_t, YElementSize / (traits::PackedSize * traits::ScalarPerVector)>;
 
         constexpr auto tile_dstr = TileDstr{};
 
@@ -620,7 +625,9 @@ struct tile_window_linear
 
             // data index [y0, y1, ...]
             constexpr auto idx_ys_start = SFC_Ys::get_index(IAccess);
-            constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start);
+            constexpr index_t d =
+                tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys_start) /
+                traits::PackedSize;
             static_assert(d % traits::ScalarPerVector == 0);
 
             get_bottom_tensor_view().template get_vectorized_elements_raw<vector_t>(
@@ -804,16 +811,17 @@ struct tile_window_linear
             // read from distributed tensor
             vector_t vec_value;
 
-            static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
+            static_for<0, traits::ScalarPerVector, traits::PackedSize>{}([&](auto j) {
                 constexpr auto idx_ys = generate_tuple(
                     [&](auto jj) {
                         return jj == traits::VectorDimY ? (idx_ys_start[jj] + j) : idx_ys_start[jj];
                     },
                     number<NDimY>{});
 
-                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
+                                      traits::PackedSize;
 
-                vec_value.template get_as<DataType>()(j) =
+                vec_value.template get_as<DataType>()(j / traits::PackedSize) =
                     dstr_tensor.get_thread_buffer().template at<d>();
             });
 
@@ -852,14 +860,15 @@ struct tile_window_linear
 
             // read from distributed tensor
             vector_t vec_value;
-            static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
+            static_for<0, traits::ScalarPerVector, traits::PackedSize>{}([&](auto j) {
                 constexpr auto idx_ys = generate_tuple(
                     [&](auto jj) {
                         return jj == traits::VectorDimY ? (idx_ys_start[jj] + j) : idx_ys_start[jj];
                     },
                     number<NDimY>{});
-                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
-                vec_value.template get_as<DataType>()(j) =
+                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
+                                      traits::PackedSize;
+                vec_value.template get_as<DataType>()(j / traits::PackedSize) =
                     dstr_tensor.get_thread_buffer().template at<d>();
             });
 
@@ -897,16 +906,17 @@ struct tile_window_linear
             // read from distributed tensor
             vector_t vec_value;
 
-            static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
+            static_for<0, traits::ScalarPerVector, traits::PackedSize>{}([&](auto j) {
                 constexpr auto idx_ys = generate_tuple(
                     [&](auto jj) {
                         return jj == traits::VectorDimY ? (idx_ys_start[jj] + j) : idx_ys_start[jj];
                     },
                     number<NDimY>{});
 
-                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
+                                      traits::PackedSize;
 
-                vec_value.template get_as<DataType>()(j) =
+                vec_value.template get_as<DataType>()(j / traits::PackedSize) =
                     dstr_tensor.get_thread_buffer().template at<d>();
             });
 
@@ -948,16 +958,17 @@ struct tile_window_linear
             // read from distributed tensor
             vector_t vec_value;
 
-            static_for<0, traits::ScalarPerVector, 1>{}([&](auto j) {
+            static_for<0, traits::ScalarPerVector, traits::PackedSize>{}([&](auto j) {
                 constexpr auto idx_ys = generate_tuple(
                     [&](auto jj) {
                         return jj == traits::VectorDimY ? (idx_ys_start[jj] + j) : idx_ys_start[jj];
                     },
                     number<NDimY>{});
 
-                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys);
+                constexpr index_t d = tile_dstr.get_ys_to_d_descriptor().calculate_offset(idx_ys) /
+                                      traits::PackedSize;
 
-                vec_value.template get_as<DataType>()(j) =
+                vec_value.template get_as<DataType>()(j / traits::PackedSize) =
                     dstr_tensor.get_thread_buffer().template at<d>();
             });
 
diff --git a/include/ck_tile/host/check_err.hpp b/include/ck_tile/host/check_err.hpp
index ea70563d58..745c18d6dd 100644
--- a/include/ck_tile/host/check_err.hpp
+++ b/include/ck_tile/host/check_err.hpp
@@ -29,11 +29,12 @@ double get_relative_threshold(const int number_of_accumulations = 1)
     using I8   = int8_t;
     using I32  = int32_t;
 
-    static_assert(is_any_of<ComputeDataType, F8, BF8, F16, BF16, F32, I8, I32, int>::value,
-                  "Warning: Unhandled ComputeDataType for setting up the relative threshold!");
+    static_assert(
+        is_any_of<ComputeDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
+        "Warning: Unhandled ComputeDataType for setting up the relative threshold!");
 
     double compute_error = 0;
-    if constexpr(is_any_of<ComputeDataType, I8, I32, int>::value)
+    if constexpr(is_any_of<ComputeDataType, pk_int4_t, I8, I32, int>::value)
     {
         return 0;
     }
@@ -42,11 +43,11 @@ double get_relative_threshold(const int number_of_accumulations = 1)
         compute_error = std::pow(2, -numeric_traits<ComputeDataType>::mant) * 0.5;
     }
 
-    static_assert(is_any_of<OutDataType, F8, BF8, F16, BF16, F32, I8, I32, int>::value,
+    static_assert(is_any_of<OutDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
                   "Warning: Unhandled OutDataType for setting up the relative threshold!");
 
     double output_error = 0;
-    if constexpr(is_any_of<OutDataType, I8, I32, int>::value)
+    if constexpr(is_any_of<OutDataType, pk_int4_t, I8, I32, int>::value)
     {
         return 0;
     }
@@ -56,11 +57,11 @@ double get_relative_threshold(const int number_of_accumulations = 1)
     }
     double midway_error = std::max(compute_error, output_error);
 
-    static_assert(is_any_of<AccDataType, F8, BF8, F16, BF16, F32, I8, I32, int>::value,
+    static_assert(is_any_of<AccDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
                   "Warning: Unhandled AccDataType for setting up the relative threshold!");
 
     double acc_error = 0;
-    if constexpr(is_any_of<AccDataType, I8, I32, int>::value)
+    if constexpr(is_any_of<AccDataType, pk_int4_t, I8, I32, int>::value)
     {
         return 0;
     }
@@ -82,12 +83,13 @@ double get_absolute_threshold(const double max_possible_num, const int number_of
     using I8   = int8_t;
     using I32  = int32_t;
 
-    static_assert(is_any_of<ComputeDataType, F8, BF8, F16, BF16, F32, I8, I32, int>::value,
-                  "Warning: Unhandled ComputeDataType for setting up the absolute threshold!");
+    static_assert(
+        is_any_of<ComputeDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
+        "Warning: Unhandled ComputeDataType for setting up the absolute threshold!");
 
     auto expo            = std::log2(std::abs(max_possible_num));
     double compute_error = 0;
-    if constexpr(is_any_of<ComputeDataType, I8, I32, int>::value)
+    if constexpr(is_any_of<ComputeDataType, pk_int4_t, I8, I32, int>::value)
     {
         return 0;
     }
@@ -96,11 +98,11 @@ double get_absolute_threshold(const double max_possible_num, const int number_of
         compute_error = std::pow(2, expo - numeric_traits<ComputeDataType>::mant) * 0.5;
     }
 
-    static_assert(is_any_of<OutDataType, F8, BF8, F16, BF16, F32, I8, I32, int>::value,
+    static_assert(is_any_of<OutDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
                   "Warning: Unhandled OutDataType for setting up the absolute threshold!");
 
     double output_error = 0;
-    if constexpr(is_any_of<OutDataType, I8, I32, int>::value)
+    if constexpr(is_any_of<OutDataType, pk_int4_t, I8, I32, int>::value)
     {
         return 0;
     }
@@ -110,11 +112,11 @@ double get_absolute_threshold(const double max_possible_num, const int number_of
     }
     double midway_error = std::max(compute_error, output_error);
 
-    static_assert(is_any_of<AccDataType, F8, BF8, F16, BF16, F32, I8, I32, int>::value,
+    static_assert(is_any_of<AccDataType, F8, BF8, F16, BF16, F32, pk_int4_t, I8, I32, int>::value,
                   "Warning: Unhandled AccDataType for setting up the absolute threshold!");
 
     double acc_error = 0;
-    if constexpr(is_any_of<AccDataType, I8, I32, int>::value)
+    if constexpr(is_any_of<AccDataType, pk_int4_t, I8, I32, int>::value)
     {
         return 0;
     }
diff --git a/include/ck_tile/host/fill.hpp b/include/ck_tile/host/fill.hpp
index f24c338755..006026470b 100644
--- a/include/ck_tile/host/fill.hpp
+++ b/include/ck_tile/host/fill.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -282,7 +282,14 @@ struct FillMonotonicSeq
     {
         std::generate(first, last, [=, n = init_value_]() mutable {
             auto tmp = n;
-            n += step_;
+            if constexpr(std::is_same_v<decltype(tmp), pk_int4_t>)
+            {
+                n.data += step_.data;
+            }
+            else
+            {
+                n += step_;
+            }
             return tmp;
         });
     }
diff --git a/include/ck_tile/host/host_tensor.hpp b/include/ck_tile/host/host_tensor.hpp
index 2047ad7793..a43877c6da 100644
--- a/include/ck_tile/host/host_tensor.hpp
+++ b/include/ck_tile/host/host_tensor.hpp
@@ -281,18 +281,18 @@ struct HostTensor
     using Data       = std::vector<T>;
 
     template <typename X>
-    HostTensor(std::initializer_list<X> lens) : mDesc(lens), mData(mDesc.get_element_space_size())
+    HostTensor(std::initializer_list<X> lens) : mDesc(lens), mData(get_element_space_size())
     {
     }
 
     template <typename X, typename Y>
     HostTensor(std::initializer_list<X> lens, std::initializer_list<Y> strides)
-        : mDesc(lens, strides), mData(mDesc.get_element_space_size())
+        : mDesc(lens, strides), mData(get_element_space_size())
     {
     }
 
     template <typename Lengths>
-    HostTensor(const Lengths& lens) : mDesc(lens), mData(mDesc.get_element_space_size())
+    HostTensor(const Lengths& lens) : mDesc(lens), mData(get_element_space_size())
     {
     }
 
@@ -302,7 +302,7 @@ struct HostTensor
     {
     }
 
-    HostTensor(const Descriptor& desc) : mDesc(desc), mData(mDesc.get_element_space_size()) {}
+    HostTensor(const Descriptor& desc) : mDesc(desc), mData(get_element_space_size()) {}
 
     template <typename OutT>
     HostTensor<OutT> CopyAsType() const
@@ -340,7 +340,11 @@ struct HostTensor
 
     std::size_t get_element_size() const { return mDesc.get_element_size(); }
 
-    std::size_t get_element_space_size() const { return mDesc.get_element_space_size(); }
+    std::size_t get_element_space_size() const
+    {
+        constexpr index_t PackedSize = ck_tile::numeric_traits<remove_cvref_t<T>>::PackedSize;
+        return mDesc.get_element_space_size() / PackedSize;
+    }
 
     std::size_t get_element_space_size_in_bytes() const
     {
@@ -463,29 +467,27 @@ struct HostTensor
     template <typename... Is>
     std::size_t GetOffsetFromMultiIndex(Is... is) const
     {
-        return mDesc.GetOffsetFromMultiIndex(is...);
+        constexpr index_t PackedSize = ck_tile::numeric_traits<remove_cvref_t<T>>::PackedSize;
+        return mDesc.GetOffsetFromMultiIndex(is...) / PackedSize;
     }
 
     template <typename... Is>
     T& operator()(Is... is)
     {
-        return mData[mDesc.GetOffsetFromMultiIndex(is...)];
+        return mData[GetOffsetFromMultiIndex(is...)];
     }
 
     template <typename... Is>
     const T& operator()(Is... is) const
     {
-        return mData[mDesc.GetOffsetFromMultiIndex(is...)];
+        return mData[GetOffsetFromMultiIndex(is...)];
     }
 
-    T& operator()(std::vector<std::size_t> idx)
-    {
-        return mData[mDesc.GetOffsetFromMultiIndex(idx)];
-    }
+    T& operator()(std::vector<std::size_t> idx) { return mData[GetOffsetFromMultiIndex(idx)]; }
 
     const T& operator()(std::vector<std::size_t> idx) const
     {
-        return mData[mDesc.GetOffsetFromMultiIndex(idx)];
+        return mData[GetOffsetFromMultiIndex(idx)];
     }
 
     HostTensor<T> transpose(std::vector<size_t> axes = {}) const
diff --git a/include/ck_tile/host/reference/reference_gemm.hpp b/include/ck_tile/host/reference/reference_gemm.hpp
index da0de457d4..fe5077083c 100644
--- a/include/ck_tile/host/reference/reference_gemm.hpp
+++ b/include/ck_tile/host/reference/reference_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -34,11 +34,35 @@ CK_TILE_HOST void reference_gemm(const HostTensor<ADataType>& a_m_k,
 
         for(std::size_t k = 0; k < K; ++k)
         {
-            ADataType v_a = a_element_op(a_m_k(m, k));
-            BDataType v_b = b_element_op(b_k_n(k, n));
-
-            v_acc +=
-                ck_tile::type_convert<AccDataType>(v_a) * ck_tile::type_convert<AccDataType>(v_b);
+            AccDataType v_a;
+            AccDataType v_b;
+            if constexpr(std::is_same_v<ADataType, pk_int4_t>)
+            {
+                const pk_int4_t pk_val  = a_element_op(a_m_k(m, k));
+                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(pk_val);
+                if(k % 2 == 1)
+                    v_a = fp32_val.hi;
+                else
+                    v_a = fp32_val.lo;
+            }
+            else
+            {
+                v_a = ck_tile::type_convert<AccDataType>(a_element_op(a_m_k(m, k)));
+            }
+            if constexpr(std::is_same_v<BDataType, pk_int4_t>)
+            {
+                const pk_int4_t pk_val  = b_element_op(b_k_n(k, n));
+                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(pk_val);
+                if(k % 2 == 1)
+                    v_b = fp32_val.hi;
+                else
+                    v_b = fp32_val.lo;
+            }
+            else
+            {
+                v_b = ck_tile::type_convert<AccDataType>(b_element_op(b_k_n(k, n)));
+            }
+            v_acc += v_a * v_b;
         }
 
         c_m_n(m, n) = ck_tile::type_convert<CDataType>(acc_element_op(v_acc));
@@ -73,6 +97,8 @@ __global__ void naive_gemm_kernel(ADataType* A,
         AccDataType acc = 0.0;
         for(int k = 0; k < K; ++k)
         {
+            constexpr index_t packed_size_a = ck_tile::numeric_traits<ADataType>::PackedSize;
+            constexpr index_t packed_size_b = ck_tile::numeric_traits<BDataType>::PackedSize;
             // Adjust indexing based on matrix layout
             int a_index = (std::is_same_v<LayoutA, tensor_layout::gemm::RowMajor>)
                               ? row * strideA + k
@@ -80,8 +106,34 @@ __global__ void naive_gemm_kernel(ADataType* A,
             int b_index = (std::is_same_v<LayoutB, tensor_layout::gemm::ColumnMajor>)
                               ? col * strideB + k
                               : k * strideB + col;
-            acc += ck_tile::type_convert<AccDataType>(A[a_index]) *
-                   ck_tile::type_convert<AccDataType>(B[b_index]);
+
+            AccDataType v_a;
+            AccDataType v_b;
+            if constexpr(std::is_same_v<ADataType, pk_int4_t>)
+            {
+                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(A[a_index / packed_size_a]);
+                if(k % 2 == 1)
+                    v_a = fp32_val.hi;
+                else
+                    v_a = fp32_val.lo;
+            }
+            else
+            {
+                v_a = ck_tile::type_convert<AccDataType>(A[a_index]);
+            }
+            if constexpr(std::is_same_v<BDataType, pk_int4_t>)
+            {
+                const fp32x2_t fp32_val = pk_int4_t_to_fp32x2_t(B[b_index / packed_size_b]);
+                if(k % 2 == 1)
+                    v_b = fp32_val.hi;
+                else
+                    v_b = fp32_val.lo;
+            }
+            else
+            {
+                v_b = ck_tile::type_convert<AccDataType>(B[b_index]);
+            }
+            acc += v_a * v_b;
         }
 
         int c_index = (std::is_same_v<LayoutC, tensor_layout::gemm::RowMajor>)
diff --git a/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp b/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
index 3e8dac30ef..a3a0df996d 100644
--- a/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
+++ b/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -9,20 +9,166 @@
 namespace ck_tile {
 namespace element_wise {
 
-#if 0
+// Fast int4x4 to fp16x8_t data type conversion based on paper
+// [Who Says Elephants Can't Run: Bringing Large Scale MoE Models into Cloud Scale Production]
+// (https://arxiv.org/abs/2211.10017) and implementation:
+// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
+CK_TILE_DEVICE fp16x4_t i4_to_half4(int q)
+{
+    const int LO = 0x000f000f;
+    const int HI = 0x00f000f0;
+    const int EX = 0x64006400;
+
+    int lo;
+    int hi;
+    // Extract the two int4 at low bit and create two fp16 number.
+    asm volatile("v_and_or_b32 %0, %1, %2, %3" : "=v"(lo) : "v"(q), "v"(LO), "v"(EX));
+    // Extract the two int4 at hight bit and create two fp16 number.
+    asm volatile("v_and_or_b32 %0, %1, %2, %3" : "=v"(hi) : "v"(q), "v"(HI), "v"(EX));
+
+    const int SUB = 0xE408E408; // half2 {-1032, -1032}
+    const int MUL = 0x2c002c00; // half2 {1 / 16, 1 / 16}
+    const int ADD = 0xd480d480; // half2 {-72, -72}
+
+    fp16x4_t res;
+
+    // for two fp16 from lowbit, subtract 1032 to get correct fp16 value
+    asm volatile("v_pk_add_f16 %0, %1, %2"
+                 : "=v"(res.lo)
+                 : "v"(bit_cast<fp16x2_t>(lo)), "v"(bit_cast<fp16x2_t>(SUB)));
+
+    // for two fp16 from highbit, divide 16 and subtract 72 to get correct fp16 value
+    asm volatile(
+        "v_pk_fma_f16 %0, %1, %2, %3"
+        : "=v"(res.hi)
+        : "v"(bit_cast<fp16x2_t>(hi)), "v"(bit_cast<fp16x2_t>(MUL)), "v"(bit_cast<fp16x2_t>(ADD)));
+
+    return res;
+}
+
+CK_TILE_DEVICE fp16x4_t i4_to_half4_scale(int q, const fp16x2_t& scale)
+{
+    const int LO = 0x000f000f;
+    const int HI = 0x00f000f0;
+    const int EX = 0x64006400;
+
+    int lo;
+    int hi;
+    // Extract the two int4 at low bit and create two fp16 number.
+    asm volatile("v_and_or_b32 %0, %1, %2, %3" : "=v"(lo) : "v"(q), "v"(LO), "v"(EX));
+    // Extract the two int4 at hight bit and create two fp16 number.
+    asm volatile("v_and_or_b32 %0, %1, %2, %3" : "=v"(hi) : "v"(q), "v"(HI), "v"(EX));
+
+    const int SUB = 0xE408E408; // half2 {-1032, -1032}
+    const int MUL = 0x2c002c00; // half2 {1 / 16, 1 / 16}
+    const int ADD = 0xd480d480; // half2 {-72, -72}
+
+    fp16x4_t res;
+
+    asm volatile("v_pk_add_f16 %0, %1, %2"
+                 : "=v"(res.lo)
+                 : "v"(bit_cast<fp16x2_t>(lo)), "v"(bit_cast<fp16x2_t>(SUB)));
+
+    asm volatile(
+        "v_pk_fma_f16 %0, %1, %2, %3"
+        : "=v"(res.hi)
+        : "v"(bit_cast<fp16x2_t>(hi)), "v"(bit_cast<fp16x2_t>(MUL)), "v"(bit_cast<fp16x2_t>(ADD)));
+
+    asm volatile("v_pk_mul_f16 %0, %1, %2" : "=v"(res.lo) : "v"(res.lo), "v"(scale));
+
+    asm volatile("v_pk_mul_f16 %0, %1, %2" : "=v"(res.hi) : "v"(res.hi), "v"(scale));
+
+    return res;
+}
+
+CK_TILE_DEVICE bf16x4_t i4_to_bhalf4(int q)
+{
+    uint32_t i8s = (q & 0xf) | ((q & 0xf0) << 4) | ((q & 0xf00) << 8) | ((q & 0xf000) << 12);
+
+    static constexpr uint32_t fp32_base = 0x4B000000;
+
+    float fp32_intermediates[4];
+
+    uint32_t* fp32_intermediates_casted = reinterpret_cast<uint32_t*>(fp32_intermediates);
+
+    fp32_intermediates_casted[0] = __byte_perm(i8s, fp32_base, 0x7650);
+    fp32_intermediates_casted[1] = __byte_perm(i8s, fp32_base, 0x7651);
+    fp32_intermediates_casted[2] = __byte_perm(i8s, fp32_base, 0x7652);
+    fp32_intermediates_casted[3] = __byte_perm(i8s, fp32_base, 0x7653);
+
+    fp32_intermediates[0] -= 8388616.f;
+    fp32_intermediates[1] -= 8388616.f;
+    fp32_intermediates[2] -= 8388616.f;
+    fp32_intermediates[3] -= 8388616.f;
+
+    bf16x4_t res;
+    res.lo = bit_cast<bf16x2_t>(
+        __byte_perm(fp32_intermediates_casted[1], fp32_intermediates_casted[0], 0x7632));
+    res.hi = bit_cast<bf16x2_t>(
+        __byte_perm(fp32_intermediates_casted[3], fp32_intermediates_casted[2], 0x7632));
+
+    return res;
+}
+
+struct PassThroughPack8
+{
+    template <typename Y, typename X>
+    CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const;
+
+    CK_TILE_HOST_DEVICE constexpr void operator()(fp16x8_t& y, const pk_int4x4_t& x) const
+    {
+        y.lo = i4_to_half4(bit_cast<int>(x));
+        y.hi = i4_to_half4(bit_cast<int>(x) >> 8);
+    }
+
+    CK_TILE_HOST_DEVICE constexpr void operator()(bf16x8_t& y, const pk_int4x4_t& x) const
+    {
+        y.lo = i4_to_bhalf4(bit_cast<int>(x));
+        y.hi = i4_to_bhalf4(bit_cast<int>(x) >> 16);
+    }
+    constexpr const static bool is_pack8_invocable = true;
+};
+
+struct DequantPack8
+{
+    template <typename Y, typename X, typename Z>
+    CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x, const Z& z) const;
+
+    CK_TILE_HOST_DEVICE constexpr void
+    operator()(fp16x8_t& y, const pk_int4x4_t& x, const fp16x2_t& z) const
+    {
+        y.lo = i4_to_half4_scale(bit_cast<int>(x), z);
+        y.hi = i4_to_half4_scale(bit_cast<int>(x) >> 8, z);
+    }
+
+    constexpr const static bool is_pack8_invocable = true;
+};
+
 struct PassThroughPack2
 {
     template <typename Y, typename X>
     CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const;
 
-    CK_TILE_HOST_DEVICE constexpr void operator()(ck_tile::half2_t& y, const ck_tile::f8x2_t& x) const
+#if 0
+    CK_TILE_HOST_DEVICE constexpr void operator()(ck_tile::fp16x2_t& y, const ck_tile::f8x2_t& x) const
     {
         auto t = type_convert<float2_t>(x);
-        y      = type_convert<half2_t>(t);
+        y      = type_convert<fp16x2_t>(t);
     }
+#endif
+
+    CK_TILE_HOST_DEVICE constexpr void operator()(fp16x2_t& y, const pk_int4_t& x) const
+    {
+        uint8_t x_u8 = bit_cast<uint8_t>(x);
+        uint8_t x_l  = (x_u8 & 0x0f) >> 0;
+        uint8_t x_h  = (x_u8 & 0xf0) >> 4;
+
+        y.lo = type_convert<half_t>(x_l);
+        y.hi = type_convert<half_t>(x_h);
+    }
+
     constexpr const static bool is_pack2_invocable = true;
 };
-#endif
 
 struct PassThrough
 {
diff --git a/include/ck_tile/ops/fmha/block/block_masking.hpp b/include/ck_tile/ops/fmha/block/block_masking.hpp
index 1569c93565..726543b97a 100644
--- a/include/ck_tile/ops/fmha/block/block_masking.hpp
+++ b/include/ck_tile/ops/fmha/block/block_masking.hpp
@@ -310,7 +310,7 @@ struct SimplifiedGenericAttentionMask
 
         const index_t x_per_split = ck_tile::max(1, integer_divide_ceil(x_total, num_splits));
         const index_t split_start = x_per_split * i_split;
-        const index_t split_end   = split_start + x_per_split;
+        const index_t split_end   = ck_tile::min(x_total, split_start + x_per_split);
 
         return ck_tile::make_tuple(ck_tile::max(origin_start, split_start),
                                    ck_tile::min(origin_end, split_end));
diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
index 92dc2bac3f..14d0596287 100644
--- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
+++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_splitkv_kernel.hpp
@@ -742,7 +742,7 @@ struct FmhaFwdSplitKVKernel
                 return pad_tensor_view(
                     v_dram_transposed,
                     make_tuple(number<FmhaPipeline::kN1>{}, number<FmhaPipeline::kK1>{}),
-                    sequence<kPadHeadDimV, false>{});
+                    sequence<kPadHeadDimV, kPadSeqLenK>{});
             }
             else
             {
diff --git a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
index 6d03e233c6..830662216a 100644
--- a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
+++ b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
@@ -101,7 +101,7 @@ namespace ck_tile {
 //   max_num_tokens_padded: opk_ids.numel() + num_experts * (block_size - 1)
 
 
-CK_TILE_HOST constexpr auto moe_sorting_get_smem_row_col(int num_tokens_, int num_experts_)
+CK_TILE_HOST constexpr auto moe_sorting_get_smem_row_col(int tokens_, int num_experts_)
 {
     /*               num_experts + 1
     *   +--------------------------------------+
@@ -132,7 +132,7 @@ CK_TILE_HOST constexpr auto moe_sorting_get_smem_row_col(int num_tokens_, int nu
 
         // round to sub_unroll multipl
         int r_for_sub_token = r - cumsum_bufs;
-        r_for_sub_token = min(r_for_sub_token, num_tokens_);
+        r_for_sub_token = min(r_for_sub_token, tokens_);
         r_for_sub_token = (r_for_sub_token + sub_unroll - 1) / sub_unroll * sub_unroll;
         r_for_sub_token = max(r_for_sub_token, 1);
 
@@ -148,7 +148,6 @@ CK_TILE_HOST constexpr auto moe_sorting_get_smem_row_col(int num_tokens_, int nu
             
             mask_ = mask_ > 0b111 ? 0b111 : mask_;  //clamp to 8x at most
             mask_ = ~mask_;
-            //printf("r_unroll_:%d, clz:%d, mask:%x\n", r_unroll_, clz_, mask_); fflush(stdout);
 
             r_for_sub_token = (r_unroll_ & mask_) * sub_unroll;
         }
@@ -161,11 +160,17 @@ CK_TILE_HOST constexpr auto moe_sorting_get_smem_row_col(int num_tokens_, int nu
         return r_for_sub_token + cumsum_bufs;
     }();
 
-    // printf("r:%d, c:%d\n", smem_rows, smem_cols);
-
     return ck_tile::make_tuple(smem_rows, smem_cols);
 }
 
+CK_TILE_HOST index_t moe_sorting_get_sub_token(int tokens_, int num_experts_)
+{
+    auto [r_, c_]            = moe_sorting_get_smem_row_col(tokens_, num_experts_);
+    auto sub_token_          = r_ - 2;
+    (void) c_;
+    return sub_token_;
+}
+
 struct MoeSortingHostArgs
 {
     const void* p_topk_ids;     // [token, topk]
@@ -180,6 +185,9 @@ struct MoeSortingHostArgs
     // we fused the setzero of output of fused-moe buffer
     // set this pointer to nullptr will skip this operation
     void* p_moe_buf;
+    void* p_ws;             // size is moe_sorting_get_workspace_size()
+                            // if return zero, then could be nullptr
+                            // must be cleard before use
     index_t tokens;
     index_t unit_size;      // this is the M_a of fused-moe kernel
     index_t num_experts;
@@ -1056,6 +1064,812 @@ struct MoeSortingKernel
     }
 };
 
+namespace impl {
+
+// [expert, padded_tokens]
+CK_TILE_HOST_DEVICE index_t moe_sorting_mp_mesh_stride(index_t tokens)
+{
+    constexpr index_t chunk = 32;
+    return (tokens + chunk - 1) / chunk * chunk;
+};
+
+CK_TILE_HOST_DEVICE index_t moe_sorting_mp_mesh_elem(index_t tokens, index_t num_experts)
+{
+    index_t row_size = moe_sorting_mp_mesh_stride(tokens);
+    return num_experts * row_size;
+};
+
+CK_TILE_HOST_DEVICE index_t moe_sorting_mp_cumsum_elem(index_t num_experts)
+{
+    constexpr index_t chunk = 32;
+    index_t row_size        = num_experts + 1;
+    return (row_size + chunk - 1) / chunk * chunk;
+};
+
+template <typename T, typename F, index_t wave_size_ = warpSize>
+CK_TILE_DEVICE constexpr T moe_sorting_wave_reduce(T local, F reduce_f, number<wave_size_> = {})
+{
+    // constexpr int wave_size = 64;
+    // constexpr int reduce_stage = 6; // 1<<6=64
+    // clang-format off
+    constexpr int reduce_stage = [](){
+        if constexpr(wave_size_ == 2) return 1;
+        else if constexpr(wave_size_ == 4) return 2;
+        else if constexpr(wave_size_ == 8) return 3;
+        else if constexpr(wave_size_ == 16) return 4;
+        else if constexpr(wave_size_ == 32) return 5;
+        else if constexpr(wave_size_ == 64) return 6;
+        else return 0;
+    }();
+    // clang-format on
+    T v_local = local;
+#pragma unroll reduce_stage
+    for(int i_stage = 0; i_stage < reduce_stage; i_stage++)
+    {
+        int src_lane = __lane_id() ^ (1 << i_stage);
+        int32_t v_remote_tmp =
+            __builtin_amdgcn_ds_bpermute(src_lane << 2, bit_cast<int32_t>(v_local));
+        T v_remote = bit_cast<T>(v_remote_tmp);
+        v_local    = reduce_f(v_local, v_remote);
+    }
+    return v_local;
+}
+
+// [a, b, c, d....] -> [a, a+b, a+b+c, a+b+c+d, ....]
+// NOTE: wave_size need at least be 16!! dpp 16 is one row
+template <typename data_t, int wave_size>
+CK_TILE_DEVICE void moe_sorting_wave_cumsum(data_t& thread_data)
+{
+    // wave_size must be power of 2
+    constexpr int row_mask    = 0xf;
+    constexpr int bank_mask   = 0xf;
+    constexpr bool bound_ctrl = true; // ! out-of-bound is zero !
+    auto reduce_op            = [&](auto x_, auto y_) { return x_ + y_; };
+
+    if constexpr(wave_size > 1)
+    {
+        thread_data = reduce_op(
+            thread_data,
+            __builtin_bit_cast(data_t,
+                               __builtin_amdgcn_mov_dpp(__builtin_bit_cast(int, thread_data),
+                                                        0x111,
+                                                        row_mask,
+                                                        bank_mask,
+                                                        bound_ctrl))); // row_shr:1
+    }
+
+    if constexpr(wave_size > 2)
+    {
+        thread_data = reduce_op(
+            thread_data,
+            __builtin_bit_cast(data_t,
+                               __builtin_amdgcn_mov_dpp(__builtin_bit_cast(int, thread_data),
+                                                        0x112,
+                                                        row_mask,
+                                                        bank_mask,
+                                                        bound_ctrl))); // row_shr:2
+    }
+    if constexpr(wave_size > 4)
+    {
+        thread_data = reduce_op(
+            thread_data,
+            __builtin_bit_cast(data_t,
+                               __builtin_amdgcn_mov_dpp(__builtin_bit_cast(int, thread_data),
+                                                        0x114,
+                                                        row_mask,
+                                                        bank_mask,
+                                                        bound_ctrl))); // row_shr:4
+    }
+    if constexpr(wave_size == 8)
+    {
+
+        // wave-size=8 need one extra shift
+        thread_data = reduce_op(
+            thread_data,
+            __builtin_bit_cast(data_t,
+                               __builtin_amdgcn_mov_dpp(__builtin_bit_cast(int, thread_data),
+                                                        0x118,
+                                                        row_mask,
+                                                        bank_mask,
+                                                        bound_ctrl))); // row_shr:8
+#if 0
+        constexpr int bank_mask_0_7 = 0b1100;
+        auto reduce_op_r = [&](auto x_, auto y_) { return x_ - y_; };
+        thread_data = reduce_op_r(thread_data, __builtin_bit_cast(data_t,
+                                                __builtin_amdgcn_update_dpp(0, /* old value */
+                                                    __builtin_bit_cast(int, thread_data),
+                                                    0x157,
+                                                    row_mask,
+                                                    bank_mask_0_7,
+                                                    bound_ctrl))// row_newbcast:7
+                                                    );
+#else
+        data_t xxx =
+            __builtin_bit_cast(data_t,
+                               __builtin_amdgcn_mov_dpp(__builtin_bit_cast(int, thread_data),
+                                                        0x157,
+                                                        row_mask,
+                                                        bank_mask,
+                                                        bound_ctrl)); // row_newbcast:7
+
+        data_t yyy  = (__lane_id() / 8) % 2 == 0 ? 0 : xxx;
+        thread_data = thread_data - yyy;
+#endif
+    }
+    if constexpr(wave_size > 8)
+    {
+        thread_data = reduce_op(
+            thread_data,
+            __builtin_bit_cast(data_t,
+                               __builtin_amdgcn_mov_dpp(__builtin_bit_cast(int, thread_data),
+                                                        0x118,
+                                                        row_mask,
+                                                        bank_mask,
+                                                        bound_ctrl))); // row_shr:8
+    }
+
+    if constexpr(wave_size > 16)
+    {
+        // now row-0, row-0+row-1, row-1+row-2, row-2+row-3
+        int v_remote_tmp = __builtin_amdgcn_ds_bpermute(((__lane_id() & 0x30) - 1) << 2,
+                                                        __builtin_bit_cast(int, thread_data));
+        v_remote_tmp     = __lane_id() >= 16 ? v_remote_tmp : 0;
+        thread_data      = reduce_op(thread_data, __builtin_bit_cast(data_t, v_remote_tmp));
+    }
+
+    if constexpr(wave_size > 32)
+    {
+        // lane-id 48...63->31
+        int v_remote_tmp = __builtin_amdgcn_ds_bpermute(((__lane_id() & 0x30) - 17) << 2,
+                                                        __builtin_bit_cast(int, thread_data));
+        v_remote_tmp     = __lane_id() >= 32 ? v_remote_tmp : 0;
+        thread_data      = reduce_op(thread_data, __builtin_bit_cast(data_t, v_remote_tmp));
+    }
+}
+
+template <index_t BLOCK_SIZE = 256>
+CK_TILE_DEVICE void moe_buf_set_zero_kernel(uint8x16_t* buf, index_t buf_bytes, index_t gid)
+{
+    // const index_t offset = (blockIdx.x - 1) * BLOCK_SIZE + threadIdx.x;
+    index_t offset = gid * BLOCK_SIZE + threadIdx.x;
+    if(offset < buf_bytes / 16)
+    {
+        buf[offset] = uint8x16_t{0};
+    }
+}
+
+} // namespace impl
+
+// prefer to run mp kernel if is not oneshot
+CK_TILE_HOST bool moe_sorting_is_oneshot(int tokens_, int num_experts_)
+{
+    auto sub_token_          = moe_sorting_get_sub_token(tokens_, num_experts_);
+    bool is_sub_token_onshot = tokens_ <= sub_token_;
+    return is_sub_token_onshot;
+}
+
+// return size in byte
+CK_TILE_HOST index_t moe_sorting_mp_get_workspace_size(int tokens_, int num_experts_)
+{
+    index_t elem = impl::moe_sorting_mp_mesh_elem(tokens_, num_experts_) +
+                   impl::moe_sorting_mp_cumsum_elem(num_experts_);
+    return elem * sizeof(index_t);
+}
+
+// return size in byte
+CK_TILE_HOST index_t moe_sorting_get_workspace_size(int tokens_, int num_experts_)
+{
+#if 1
+    if(moe_sorting_is_oneshot(tokens_, num_experts_))
+    {
+        return 0;
+    }
+    else
+    {
+        return moe_sorting_mp_get_workspace_size(tokens_, num_experts_);
+    }
+#else
+    return moe_sorting_mp_get_workspace_size(tokens_, num_experts_);
+#endif
+}
+
+// below kernel is multi-phase implementation for large token and/or expert case
+
+// write into a buffer to record the token cnt
+// e.g. num_experts = 6, topk=3, M_a = 4, input_tokens = 5
+// before sort, topk_ids is : [[0, 3, 5], [2, 3, 5], [1, 3, 5], [1, 2, 3], [1, 3, 5]]
+//                            tok-0      tok-1      tok-2      tok-3      tok-4
+//           topk_weight is : [[a, b, c], [d, e, f], [g, h, i], [j, k, l], [m, n, o]] (some float
+//           number)
+//
+// token_id_per_expert is : [[0], [2, 3, 4], [1, 3], [0, 1, 2, 3, 4], [], [0, 1, 2, 5]]
+//  (only for reference)    exp-0  exp-1     exp-2   exp-3          exp-4  exp-5
+// weight_id_per_expert is: [[a], [g, j, m], [d, k], [b, e, h, l, n], [], [c, f, i, o]]
+/*
+
+p_expert_mesh:
+     t0 t1 t2 t3 t4 r5
+    +--+--+--+--+--+--+
+e0  | 1|  |  |  |  |  |
+e1  |  |  | 1| 1| 1|  |
+e2  |  | 1|  | 1|  |  |
+e3  | 1| 1| 1| 1| 1|  |
+e4  |  |  |  |  |  |  |
+e5  | 1| 1| 1|  |  | 1|
+
+
+p_expert_cumsum:
+    | 1| 3| 2| 5| 0| 4|
+     e0 e1 e2 e3 e4 e5
+
+p_expert_cumsum(with M_a pad, and skip zero tokens):
+    | 4| 4| 4| 8| 0| 4|
+     e0 e1 e2 e3 e4 e5
+
+p_expert_cumsum
+    | 0| 4| 8|12|20|20|24|
+
+local_expert_mask : [1, 0, 1, 1, 0, 1] (mask out expert-id=1, 4)
+
+p_m_cumsum
+    | 0| 1| 1| 2| 3| 3| 4|
+
+*/
+
+// count topk_id into mesh
+template <typename Problem_>
+struct MoeSortingMultiPhaseKernel_P0
+{
+    using Problem = remove_cvref_t<Problem_>;
+
+    using IndexType  = typename Problem::IndexType;
+    using WeightType = typename Problem::WeightType;
+
+    static constexpr index_t BLOCK_SIZE = 256;
+    static constexpr index_t OCCUPANCY  = 2; // hard coded
+
+    typedef MoeSortingHostArgs MoeSortingKargs;
+
+    using Hargs = MoeSortingHostArgs;
+
+    struct Kargs
+    {
+        const void* p_topk_ids; // [tokens, topk]
+        void* p_expert_mesh;    // [expert, tokens]
+        index_t tokens;
+        index_t mesh_stride; // mesh_stride for p_expert_mesh
+        mdiv topk_mdiv;
+    };
+
+    CK_TILE_HOST static constexpr auto get_num_cu()
+    {
+        index_t num_cu = [&]() {
+            hipDeviceProp_t dev_prop;
+            hipDevice_t dev;
+            HIP_CHECK_ERROR(hipGetDevice(&dev));
+            HIP_CHECK_ERROR(hipGetDeviceProperties(&dev_prop, dev));
+            return dev_prop.multiProcessorCount;
+        }();
+        return num_cu;
+    }
+
+    CK_TILE_HOST static constexpr auto MakeKargs(const Hargs& h)
+    {
+        Kargs k;
+        k.p_topk_ids    = h.p_topk_ids;
+        k.p_expert_mesh = h.p_ws;
+        k.tokens        = h.tokens;
+        k.mesh_stride   = impl::moe_sorting_mp_mesh_stride(h.tokens);
+        k.topk_mdiv     = mdiv{static_cast<uint32_t>(h.topk)};
+        return k;
+    }
+
+    CK_TILE_HOST static constexpr auto GridSize(const Hargs&) { return get_num_cu() * OCCUPANCY; }
+
+    CK_TILE_HOST static constexpr auto BlockSize(const Hargs&) { return dim3(BLOCK_SIZE); }
+
+    // in byte
+    CK_TILE_HOST static constexpr auto GetSmemSize() { return 0; }
+
+    CK_TILE_DEVICE void operator()(Kargs kargs) const
+    {
+        using topk_id_t = ext_vector_t<IndexType, Problem::SubTokenTile>;
+
+        static_assert(Problem::SubTokenTile == 1 || Problem::SubTokenTile == 2 ||
+                      Problem::SubTokenTile == 4);
+
+        const topk_id_t* p_topk_ids = reinterpret_cast<const topk_id_t*>(kargs.p_topk_ids);
+        IndexType* p_expert_mesh    = reinterpret_cast<IndexType*>(kargs.p_expert_mesh);
+        index_t total_elem = kargs.tokens * kargs.topk_mdiv.divisor / Problem::SubTokenTile;
+
+#pragma unroll Problem::SubTokenTile
+        for(index_t i = blockIdx.x * BLOCK_SIZE + threadIdx.x; i < total_elem; i += blockDim.x)
+        {
+            auto x = p_topk_ids[i];
+            static_for<0, Problem::SubTokenTile, 1>{}([&](auto j) {
+                IndexType eid = x[j.value]; // ext_vector_type must use int to []
+                uint32_t curr_token_id, curr_topk_id;
+                kargs.topk_mdiv.divmod(i * Problem::SubTokenTile + j, curr_token_id, curr_topk_id);
+                p_expert_mesh[eid * kargs.mesh_stride + curr_token_id] = curr_topk_id + 1;
+            });
+        }
+    }
+};
+
+// cnt total tokens for a expert
+template <typename Problem_>
+struct MoeSortingMultiPhaseKernel_P1
+{
+    using Problem = remove_cvref_t<Problem_>;
+
+    using IndexType  = typename Problem::IndexType;
+    using WeightType = typename Problem::WeightType;
+
+    static constexpr index_t BLOCK_SIZE = 256;
+    static constexpr index_t OCCUPANCY  = 2; // hard coded
+
+    typedef MoeSortingHostArgs MoeSortingKargs;
+
+    using Hargs = MoeSortingHostArgs;
+    struct Kargs
+    {
+        const void* p_local_expert_mask; // [expert]
+        void* p_expert_mesh;             // [expert, tokens]
+        void* p_expert_cumsum;
+        index_t mesh_stride; // mesh_stride for p_expert_mesh
+    };
+
+    CK_TILE_HOST static constexpr auto MakeKargs(const Hargs& h)
+    {
+        Kargs k;
+        k.p_local_expert_mask = h.p_local_expert_mask;
+        k.p_expert_mesh       = h.p_ws;
+        k.p_expert_cumsum =
+            reinterpret_cast<void*>(reinterpret_cast<IndexType*>(h.p_ws) +
+                                    impl::moe_sorting_mp_mesh_elem(h.tokens, h.num_experts));
+        k.mesh_stride = impl::moe_sorting_mp_mesh_stride(h.tokens);
+
+        return k;
+    }
+
+    CK_TILE_HOST static constexpr auto GridSize(const Hargs& h) { return dim3(h.num_experts); }
+
+    CK_TILE_HOST static constexpr auto BlockSize(const Hargs&) { return dim3(BLOCK_SIZE); }
+
+    // in byte
+    CK_TILE_HOST_DEVICE static constexpr auto GetSmemSize()
+    {
+        return BLOCK_SIZE / warpSize * sizeof(IndexType);
+    }
+
+    CK_TILE_DEVICE void operator()(Kargs kargs) const
+    {
+        __shared__ char smem[GetSmemSize()];
+
+        int eid = blockIdx.x;
+
+        constexpr index_t index_pack = 4;                                   // always packed
+        using r_t                    = ext_vector_t<IndexType, index_pack>; // always use int32x4
+        r_t* p_expert_mesh           = reinterpret_cast<r_t*>(
+            reinterpret_cast<index_t*>(kargs.p_expert_mesh) + eid * kargs.mesh_stride);
+
+        static_assert(Problem::SubTokenTile == 1 || Problem::SubTokenTile == 2 ||
+                      Problem::SubTokenTile == 4);
+        const IndexType* p_local_expert_mask =
+            static_cast<const IndexType*>(kargs.p_local_expert_mask);
+        IndexType* p_expert_cumsum = reinterpret_cast<IndexType*>(kargs.p_expert_cumsum);
+
+        auto f_sum = [](auto x_, auto y_) { return x_ + y_; };
+
+        int loops = (kargs.mesh_stride / index_pack + BLOCK_SIZE - 1) / BLOCK_SIZE;
+
+        if constexpr(Problem::LocalExpertMasking)
+        {
+            IndexType mask = p_local_expert_mask[eid];
+            if(mask == 0)
+                return; // skip
+        }
+
+        index_t cnt = 0; // per-wave cnt
+        for(int i = 0; i < loops; i++)
+        {
+            int position = i * BLOCK_SIZE + threadIdx.x;
+            r_t v{0};
+            if(position < (kargs.mesh_stride / index_pack))
+                v = p_expert_mesh[position];
+            index_t local_sum = 0;
+            static_for<0, index_pack, 1>{}(
+                [&](auto i_vec) { local_sum += v[i_vec.value] != 0 ? 1 : 0; });
+            cnt += impl::moe_sorting_wave_reduce(local_sum, f_sum);
+        }
+
+        index_t lane_id = threadIdx.x % warpSize;
+        index_t wave_id = threadIdx.x / warpSize;
+
+        // reduce cross wave
+        IndexType* s = reinterpret_cast<IndexType*>(smem);
+        if(lane_id == 0)
+        {
+            s[wave_id] = cnt;
+        }
+        __syncthreads();
+
+        if(threadIdx.x == 0)
+        {
+            index_t c = 0;
+            for(auto i = 0; i < (BLOCK_SIZE / warpSize); i++)
+            {
+                c += s[i];
+            }
+            p_expert_cumsum[eid] = c;
+        }
+    }
+};
+
+// token count cumsum
+template <typename Problem_>
+struct MoeSortingMultiPhaseKernel_P2
+{
+    using Problem = remove_cvref_t<Problem_>;
+
+    using IndexType  = typename Problem::IndexType;
+    using WeightType = typename Problem::WeightType;
+
+    static constexpr index_t BLOCK_SIZE = 256;
+    static constexpr index_t OCCUPANCY  = 2; // hard coded
+
+    typedef MoeSortingHostArgs MoeSortingKargs;
+
+    using Hargs = MoeSortingHostArgs;
+    struct Kargs
+    {
+        const void* p_local_expert_mask; // [expert]
+        void* p_expert_mesh;             // [expert, tokens]
+        void* p_expert_cumsum;           // [expert + 1]
+        void* p_total_tokens_post_pad;   // [1]
+        void* p_sorted_expert_ids;
+        void* p_moe_buf;
+        index_t tokens;
+        index_t num_experts;
+        index_t mesh_stride; // mesh_stride for p_expert_mesh
+        mdiv unit_size_mdiv;
+        index_t moe_buf_bytes;
+    };
+
+    CK_TILE_HOST static constexpr auto MakeKargs(const Hargs& h)
+    {
+        Kargs k;
+        k.p_local_expert_mask = h.p_local_expert_mask;
+        // k.p_expert_mesh                    = h.p_ws;
+        k.p_expert_cumsum =
+            reinterpret_cast<void*>(reinterpret_cast<IndexType*>(h.p_ws) +
+                                    impl::moe_sorting_mp_mesh_elem(h.tokens, h.num_experts));
+        k.p_total_tokens_post_pad = h.p_total_tokens_post_pad;
+        k.p_sorted_expert_ids     = h.p_sorted_expert_ids;
+
+        k.p_moe_buf = h.p_moe_buf;
+
+        k.tokens         = h.tokens;
+        k.num_experts    = h.num_experts;
+        k.mesh_stride    = impl::moe_sorting_mp_mesh_stride(h.tokens);
+        k.unit_size_mdiv = mdiv{static_cast<uint32_t>(h.unit_size)};
+
+        k.moe_buf_bytes = h.moe_buf_bytes;
+
+        return k;
+    }
+
+    CK_TILE_HOST static constexpr auto GridSize(const Hargs& h)
+    {
+        // use 1 block to cumsum
+        return dim3(1 + ck_tile::integer_divide_ceil(h.moe_buf_bytes, BLOCK_SIZE * 16));
+    }
+
+    CK_TILE_HOST static constexpr auto BlockSize(const Hargs&) { return dim3(BLOCK_SIZE); }
+
+    // in byte
+    CK_TILE_HOST_DEVICE static constexpr auto GetSmemSize()
+    {
+        return 2 * BLOCK_SIZE * sizeof(IndexType);
+    }
+
+    // reduce single pixel within a wave
+    CK_TILE_DEVICE void operator()(Kargs kargs) const
+    {
+        if(blockIdx.x > 0)
+        {
+            impl::moe_buf_set_zero_kernel<BLOCK_SIZE>(
+                reinterpret_cast<uint8x16_t*>(kargs.p_moe_buf),
+                kargs.moe_buf_bytes,
+                blockIdx.x - 1);
+            return;
+        }
+        __shared__ char smem[GetSmemSize()];
+        IndexType* s = reinterpret_cast<IndexType*>(smem);
+
+        const IndexType* p_local_expert_mask =
+            static_cast<const IndexType*>(kargs.p_local_expert_mask);
+        IndexType* p_expert_cumsum = reinterpret_cast<IndexType*>(kargs.p_expert_cumsum);
+        IndexType* p_total_tokens_post_pad =
+            reinterpret_cast<IndexType*>(kargs.p_total_tokens_post_pad);
+        IndexType* p_sorted_expert_ids = reinterpret_cast<IndexType*>(kargs.p_sorted_expert_ids);
+
+        const index_t loops = (kargs.num_experts + BLOCK_SIZE - 1) / BLOCK_SIZE;
+        index_t wave_id     = threadIdx.x / warpSize;
+        index_t lane_id     = threadIdx.x % warpSize;
+
+        IndexType prev_cumsum_a = 0;
+        IndexType prev_cumsum_b = 0;
+
+        for(index_t i = 0; i < loops; i++)
+        {
+            index_t position = i * BLOCK_SIZE + threadIdx.x;
+            IndexType a_     = 0; // token count for a expert
+            IndexType b_     = 0; // mask for a expert
+            if(position < kargs.num_experts)
+            {
+                a_ = p_expert_cumsum[position];
+                if constexpr(Problem::LocalExpertMasking)
+                    b_ = p_local_expert_mask[position];
+            }
+
+            int blocks_pers_expert =
+                kargs.unit_size_mdiv.div(a_ + kargs.unit_size_mdiv.divisor - 1);
+            // pad token
+            int padded_blocks_per_expert = [&]() {
+                int x_ = [&]() {
+                    if constexpr(Problem::SkipExpertsWithZeroTokens)
+                    {
+                        // if local_cnt is zero, blocks_pers_expert will be zero
+                        // this is what we want to achieve
+                        return blocks_pers_expert; //  * kargs.unit_size_mdiv.divisor;
+                    }
+                    else
+                    {
+                        return max(blocks_pers_expert, 1);
+                    }
+                }();
+                if constexpr(Problem::LocalExpertMasking)
+                {
+                    return b_ ? x_ : 0;
+                }
+                else
+                    return x_;
+            }();
+
+            IndexType cumsum_a = padded_blocks_per_expert;
+            IndexType cumsum_b = b_;
+
+            // Note: we first cumsum local round, then add previous cumsum
+            impl::moe_sorting_wave_cumsum<IndexType, warpSize>(cumsum_a);
+            impl::moe_sorting_wave_cumsum<IndexType, warpSize>(cumsum_b);
+
+            __syncthreads();
+            if(lane_id == warpSize - 1)
+            {
+                s[4 + wave_id]                         = cumsum_a;
+                s[4 + wave_id + BLOCK_SIZE / warpSize] = cumsum_b;
+            }
+
+            __syncthreads();
+
+            // reduce cross wave
+            static_for<0, BLOCK_SIZE / warpSize - 1, 1>{}([&](auto i_w) {
+                IndexType prev_a = s[4 + i_w];
+                IndexType prev_b = s[4 + i_w + BLOCK_SIZE / warpSize];
+                prev_a           = wave_id > i_w ? prev_a : 0; // mask out
+                prev_b           = wave_id > i_w ? prev_b : 0; // mask out
+                cumsum_a += prev_a;
+                cumsum_b += prev_b;
+            });
+
+            // Now let's add previous cumsum
+            cumsum_a += prev_cumsum_a;
+            cumsum_b += prev_cumsum_b;
+
+            if(threadIdx.x == BLOCK_SIZE - 1)
+            {
+                s[2] = cumsum_a; // store the last cumsum
+                s[3] = cumsum_b;
+            }
+
+            IndexType out_0 = cumsum_a - padded_blocks_per_expert; // exclusive cumsum tok cnt
+            IndexType out_1 = cumsum_b - b_;                       // exclusive cumsum mask cnt
+
+            __syncthreads();
+            prev_cumsum_a = s[2];
+            prev_cumsum_b = s[3];
+
+            if(position < kargs.num_experts)
+            {
+                p_expert_cumsum[position] = out_0 * kargs.unit_size_mdiv.divisor;
+            }
+
+            {
+                if constexpr(Problem::LocalExpertMasking)
+                {
+                    if(b_)
+                    {
+                        for(int j = 0; j < blocks_pers_expert; j++)
+                        {
+                            p_sorted_expert_ids[out_0 + j] = out_1;
+                        }
+                    }
+                }
+                else
+                {
+                    for(int j = 0; j < blocks_pers_expert; j++)
+                    {
+                        p_sorted_expert_ids[out_0 + j] = position;
+                    }
+                }
+            }
+        }
+
+        if(threadIdx.x == 0)
+        {
+            auto total_tokens_post_pad         = prev_cumsum_a * kargs.unit_size_mdiv.divisor;
+            p_total_tokens_post_pad[0]         = total_tokens_post_pad;
+            p_expert_cumsum[kargs.num_experts] = total_tokens_post_pad;
+        }
+    }
+};
+
+template <typename Problem_>
+struct MoeSortingMultiPhaseKernel_P3
+{
+    using Problem = remove_cvref_t<Problem_>;
+
+    using IndexType  = typename Problem::IndexType;
+    using WeightType = typename Problem::WeightType;
+
+    static constexpr index_t BLOCK_SIZE = 256;
+    static constexpr index_t OCCUPANCY  = 2; // hard coded
+
+    typedef MoeSortingHostArgs MoeSortingKargs;
+
+    using Hargs = MoeSortingHostArgs;
+
+    struct Kargs
+    {
+        const void* p_weights;
+        const void* p_local_expert_mask;
+        void* p_sorted_token_ids;
+        void* p_sorted_weights;
+        void* p_expert_mesh; // [token, expert]
+        void* p_expert_cumsum;
+
+        index_t tokens;
+        index_t num_experts;
+        index_t mesh_stride; // mesh_stride for p_expert_mesh
+        mdiv topk_mdiv;
+    };
+
+    CK_TILE_HOST static constexpr auto MakeKargs(const Hargs& h)
+    {
+        Kargs k;
+        k.p_weights           = h.p_weights;
+        k.p_local_expert_mask = h.p_local_expert_mask;
+        k.p_sorted_token_ids  = h.p_sorted_token_ids;
+        k.p_sorted_weights    = h.p_sorted_weights;
+        k.p_expert_mesh       = h.p_ws;
+        k.p_expert_cumsum =
+            reinterpret_cast<void*>(reinterpret_cast<IndexType*>(h.p_ws) +
+                                    impl::moe_sorting_mp_mesh_elem(h.tokens, h.num_experts));
+        k.tokens      = h.tokens;
+        k.num_experts = h.num_experts;
+        k.topk_mdiv   = mdiv{static_cast<uint32_t>(h.topk)};
+        k.mesh_stride = impl::moe_sorting_mp_mesh_stride(h.tokens);
+        return k;
+    }
+
+    CK_TILE_HOST static constexpr auto GridSize(const Hargs& h) { return dim3(h.num_experts); }
+
+    CK_TILE_HOST static constexpr auto BlockSize(const Hargs&) { return dim3(BLOCK_SIZE); }
+
+    // in byte
+    CK_TILE_HOST_DEVICE static constexpr auto GetSmemSize()
+    {
+        return (4 + BLOCK_SIZE / warpSize) * sizeof(IndexType);
+    }
+
+    CK_TILE_DEVICE void operator()(Kargs kargs) const
+    {
+        __shared__ char smem[GetSmemSize()];
+
+        const IndexType* p_local_expert_mask =
+            static_cast<const IndexType*>(kargs.p_local_expert_mask);
+        IndexType* s                  = reinterpret_cast<IndexType*>(smem);
+        IndexType* p_expert_mesh      = reinterpret_cast<IndexType*>(kargs.p_expert_mesh);
+        IndexType* p_sorted_token_ids = reinterpret_cast<IndexType*>(kargs.p_sorted_token_ids);
+        IndexType* p_expert_cumsum    = reinterpret_cast<IndexType*>(kargs.p_expert_cumsum);
+        const WeightType* p_weights   = static_cast<const WeightType*>(kargs.p_weights);
+        WeightType* p_sorted_weights  = reinterpret_cast<WeightType*>(kargs.p_sorted_weights);
+
+        static_assert(Problem::SubTokenTile == 1 || Problem::SubTokenTile == 2 ||
+                      Problem::SubTokenTile == 4);
+
+        int eid     = blockIdx.x;
+        int wave_id = threadIdx.x / warpSize;
+        int lane_id = threadIdx.x % warpSize;
+        int e_start = p_expert_cumsum[eid];
+        int e_end   = p_expert_cumsum[eid + 1];
+        if constexpr(Problem::SkipExpertsWithZeroTokens)
+        {
+            if(e_start == e_end)
+                return;
+        }
+
+        if constexpr(Problem::LocalExpertMasking)
+        {
+            int e_mask = p_local_expert_mask[eid];
+            if(e_mask == 0)
+                return; // skip empty expert
+        }
+
+        // cumsum one by one
+        int loops       = (kargs.mesh_stride + BLOCK_SIZE - 1) / BLOCK_SIZE;
+        int prev_cumsum = 0;
+        for(int i = 0; i < loops; i++)
+        {
+            int i_token = i * BLOCK_SIZE + threadIdx.x;
+            IndexType x = 0;
+            if(i_token < kargs.tokens)
+            {
+                x = p_expert_mesh[eid * kargs.mesh_stride + i_token];
+            }
+            int i_topk = x - 1;          // topk of this token
+            int i_show = x != 0 ? 1 : 0; // has this token or not
+            int cumsum = i_show;
+            impl::moe_sorting_wave_cumsum<int, warpSize>(cumsum);
+
+            __syncthreads();
+            if(lane_id == warpSize - 1)
+            {
+                s[4 + wave_id] = cumsum;
+            }
+            __syncthreads();
+
+            // reduce cross wave
+            static_for<0, BLOCK_SIZE / warpSize - 1, 1>{}([&](auto i_w) {
+                IndexType prev = s[4 + i_w];
+                prev           = wave_id > i_w ? prev : 0; // mask out
+                cumsum += prev;
+            });
+            cumsum += prev_cumsum; // add previous round cumsum
+            if(threadIdx.x == BLOCK_SIZE - 1)
+            {
+                s[0] = cumsum;
+            }
+            __syncthreads();
+
+            int position = cumsum - i_show;
+            prev_cumsum  = s[0]; // update the last cumsum
+
+            if(i_show)
+            {
+#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
+                p_sorted_token_ids[e_start + position] = MOE_SORTING_MOCK_ID(i_token, i_topk);
+#else
+                p_sorted_token_ids[e_start + position] = i_token;
+#endif
+                p_sorted_weights[e_start + position] =
+                    p_weights[i_token * kargs.topk_mdiv.divisor + i_topk];
+            }
+        }
+
+        for(index_t i = e_start + prev_cumsum + threadIdx.x; i < e_end; i += BLOCK_SIZE)
+        {
+#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
+            p_sorted_token_ids[i] = MOE_SORTING_MOCK_ID(kargs.tokens, kargs.topk_mdiv.divisor);
+#else
+            p_sorted_token_ids[i] = tokens;
+#endif
+            p_sorted_weights[i] = static_cast<WeightType>(0.0);
+        }
+    }
+};
+
 #undef MOE_SORTING_MOCK_ID
 
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_problem.hpp b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_problem.hpp
index 15effe7118..a98e0d7652 100644
--- a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_problem.hpp
+++ b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_problem.hpp
@@ -49,4 +49,21 @@ struct MoeSortingProblemEx
     static constexpr index_t ExpertTile = ExpertTile_; // TODO: only used in store out
 };
 
+template <typename IndexType_,
+          typename WeightType_,
+          index_t SubTokenTile_,    // 1,2,4
+          bool LocalExpertMasking_, // used in EP case
+          bool SkipExpertsWithZeroTokens_ = true>
+struct MoeSortingProblemMp
+{
+    // TODO: this kernel only support warp per row
+    using WeightType = remove_cvref_t<WeightType_>;
+    using IndexType  = remove_cvref_t<IndexType_>;
+
+    static constexpr index_t SubTokenTile           = SubTokenTile_;
+    static constexpr bool LocalExpertMasking        = LocalExpertMasking_;
+    static constexpr bool SkipExpertsWithZeroTokens = SkipExpertsWithZeroTokens_;
+    static_assert(SubTokenTile == 1 || SubTokenTile == 2 || SubTokenTile == 4);
+};
+
 } // namespace ck_tile
diff --git a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp
index ab21398b99..d9d6739fb5 100644
--- a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp
+++ b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp
@@ -1,11 +1,12 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
 #include "ck_tile/core.hpp"
 #include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp"
 #include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
+#include "ck_tile/ops/elementwise.hpp"
 
 namespace ck_tile {
 
@@ -20,12 +21,13 @@ struct BlockUniversalGemmAsBsCr
     template <typename PipelineProblem_, typename GemmPolicy_>
     struct GemmTraits_
     {
-        using Problem        = remove_cvref_t<PipelineProblem_>;
-        using Policy         = remove_cvref_t<GemmPolicy_>;
-        using ADataType      = remove_cvref_t<typename Problem::ADataType>;
-        using BDataType      = remove_cvref_t<typename Problem::BDataType>;
-        using CDataType      = remove_cvref_t<typename Problem::CDataType>;
-        using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
+        using Problem         = remove_cvref_t<PipelineProblem_>;
+        using Policy          = remove_cvref_t<GemmPolicy_>;
+        using ADataType       = remove_cvref_t<typename Problem::ADataType>;
+        using BDataType       = remove_cvref_t<typename Problem::BDataType>;
+        using ComputeDataType = remove_cvref_t<typename Problem::ComputeDataType>;
+        using CDataType       = remove_cvref_t<typename Problem::CDataType>;
+        using BlockGemmShape  = remove_cvref_t<typename Problem::BlockGemmShape>;
 
         static constexpr index_t kBlockSize = Problem::kBlockSize;
         static constexpr auto Scheduler     = Problem::Scheduler;
@@ -71,10 +73,10 @@ struct BlockUniversalGemmAsBsCr
         using BWarpTileDistr = remove_cvref_t<decltype(make_static_tile_distribution(
             typename WarpGemm::BWarpDstrEncoding{}))>;
 
-        using AWarpTile =
-            remove_cvref_t<decltype(make_static_distributed_tensor<ADataType>(AWarpTileDistr{}))>;
-        using BWarpTile =
-            remove_cvref_t<decltype(make_static_distributed_tensor<BDataType>(BWarpTileDistr{}))>;
+        using AWarpTile = remove_cvref_t<decltype(make_static_distributed_tensor<ComputeDataType>(
+            AWarpTileDistr{}))>;
+        using BWarpTile = remove_cvref_t<decltype(make_static_distributed_tensor<ComputeDataType>(
+            BWarpTileDistr{}))>;
 
         // TODO: Should we have two policies? Interwave & Intrawave ??
         static constexpr index_t InterWaveSchedulingMacClusters = 1;
@@ -90,9 +92,10 @@ struct BlockUniversalGemmAsBsCr
     public:
     using Traits = GemmTraits_<Problem_, Policy_>;
 
-    using ADataType = remove_cvref_t<typename Traits::ADataType>;
-    using BDataType = remove_cvref_t<typename Traits::BDataType>;
-    using CDataType = remove_cvref_t<typename Traits::CDataType>;
+    using ADataType       = remove_cvref_t<typename Traits::ADataType>;
+    using BDataType       = remove_cvref_t<typename Traits::BDataType>;
+    using ComputeDataType = remove_cvref_t<typename Traits::ComputeDataType>;
+    using CDataType       = remove_cvref_t<typename Traits::CDataType>;
 
     using WarpGemm = remove_cvref_t<typename Traits::WarpGemm>;
 
@@ -105,10 +108,34 @@ struct BlockUniversalGemmAsBsCr
 
     static constexpr auto Scheduler = Traits::Scheduler;
 
+    static constexpr index_t APackedSize =
+        ck_tile::numeric_traits<remove_cvref_t<ADataType>>::PackedSize;
+    static constexpr index_t BPackedSize =
+        ck_tile::numeric_traits<remove_cvref_t<BDataType>>::PackedSize;
+
     using I0 = number<0>;
     using I1 = number<1>;
 
     private:
+    template <typename WarpWindow, typename WarpTile>
+    CK_TILE_DEVICE static void load_interleaved_pk_type(const WarpWindow& warp_window,
+                                                        WarpTile& warp_tile)
+    {
+        constexpr index_t UnaryOpSize = 8;
+        const element_wise::PassThroughPack8 elementwise_op{};
+        constexpr index_t thread_buffer_size =
+            Traits::AWarpTile::get_thread_buffer_size() / UnaryOpSize;
+        const auto in_dstr_tensors = load_tile(warp_window);
+
+        static_assert(Traits::AWarpTile::get_thread_buffer_size() % UnaryOpSize == 0);
+
+        using ComputeVectorType = ComputeDataType __attribute__((ext_vector_type(UnaryOpSize)));
+        static_for<0, thread_buffer_size, 1>{}([&](auto i) {
+            elementwise_op(warp_tile.get_thread_buffer().template get_as<ComputeVectorType>()(i),
+                           in_dstr_tensors.get_thread_buffer().template get_as<pk_int4x4_t>()[i]);
+        });
+    }
+
     template <GemmPipelineScheduler Scheduler, typename GemmTraits>
     struct BlockGemmImpl
     {
@@ -208,6 +235,8 @@ struct BlockUniversalGemmAsBsCr
             });
 
             using CWarpDstr   = typename WarpGemm::CWarpDstr;
+            using AWarpTensor = typename WarpGemm::AWarpTensor;
+            using BWarpTensor = typename WarpGemm::BWarpTensor;
             using CWarpTensor = typename WarpGemm::CWarpTensor;
 
             constexpr auto c_warp_y_lengths =
@@ -217,10 +246,26 @@ struct BlockUniversalGemmAsBsCr
             // hot loop:
             static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) {
                 static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
-                    const auto a_warp_tile = load_tile(a_warp_windows(mIter)(kIter));
+                    AWarpTensor a_warp_tile;
+                    if constexpr(std::is_same_v<ADataType, pk_int4_t>)
+                    {
+                        load_interleaved_pk_type(a_warp_windows(mIter)(kIter), a_warp_tile);
+                    }
+                    else
+                    {
+                        a_warp_tile = load_tile(a_warp_windows(mIter)(kIter));
+                    }
 
                     static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
-                        const auto b_warp_tile = load_tile(b_warp_windows(nIter)(kIter));
+                        BWarpTensor b_warp_tile;
+                        if constexpr(std::is_same_v<BDataType, pk_int4_t>)
+                        {
+                            load_interleaved_pk_type(b_warp_windows(nIter)(kIter), b_warp_tile);
+                        }
+                        else
+                        {
+                            b_warp_tile = load_tile(b_warp_windows(nIter)(kIter));
+                        }
 
                         // read C warp tensor from C block tensor-
                         CWarpTensor c_warp_tensor;
@@ -342,11 +387,27 @@ struct BlockUniversalGemmAsBsCr
             static_for<0, KIterPerWarp, 1>{}([&](auto kIter) {
                 static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
                     // read A warp tensor from A block window
-                    load_tile(a_warp_tiles_(mIter)(kIter), a_warp_windows(mIter)(kIter));
+                    if constexpr(std::is_same_v<ADataType, pk_int4_t>)
+                    {
+                        load_interleaved_pk_type(a_warp_windows(mIter)(kIter),
+                                                 a_warp_tiles_(mIter)(kIter));
+                    }
+                    else
+                    {
+                        a_warp_tiles_(mIter)(kIter) = load_tile(a_warp_windows(mIter)(kIter));
+                    }
                 });
                 static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
                     // read B warp tensor from B Block window
-                    load_tile(b_warp_tiles_(nIter)(kIter), b_warp_windows(nIter)(kIter));
+                    if constexpr(std::is_same_v<BDataType, pk_int4_t>)
+                    {
+                        load_interleaved_pk_type(b_warp_windows(nIter)(kIter),
+                                                 b_warp_tiles_(nIter)(kIter));
+                    }
+                    else
+                    {
+                        b_warp_tiles_(nIter)(kIter) = load_tile(b_warp_windows(nIter)(kIter));
+                    }
                 });
             });
         }
@@ -504,12 +565,27 @@ struct BlockUniversalGemmAsBsCr
             // TODO check if a_warp_tiles has same desc as a_warp_window
             static_for<0, KInnerLoopIter, 1>{}([&](auto kIter) {
                 static_for<0, MIterPerWarp, 1>{}([&](auto mIter) {
-                    // read A warp tensor from A block window
-                    load_tile(a_warp_tiles_(mIter)(kIter), a_warp_windows(mIter)(kIter));
+                    if constexpr(std::is_same_v<ADataType, pk_int4_t>)
+                    {
+                        load_interleaved_pk_type(a_warp_windows(mIter)(kIter),
+                                                 a_warp_tiles_(mIter)(kIter));
+                    }
+                    else
+                    {
+                        a_warp_tiles_(mIter)(kIter) = load_tile(a_warp_windows(mIter)(kIter));
+                    }
                 });
                 static_for<0, NIterPerWarp, 1>{}([&](auto nIter) {
                     // read B warp tensor from B Block window
-                    load_tile(b_warp_tiles_(nIter)(kIter), b_warp_windows(nIter)(kIter));
+                    if constexpr(std::is_same_v<BDataType, pk_int4_t>)
+                    {
+                        load_interleaved_pk_type(b_warp_windows(nIter)(kIter),
+                                                 b_warp_tiles_(nIter)(kIter));
+                    }
+                    else
+                    {
+                        b_warp_tiles_(nIter)(kIter) = load_tile(b_warp_windows(nIter)(kIter));
+                    }
                 });
             });
         }
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp
index 69c50c7cd0..73d5ce8f81 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp
@@ -54,6 +54,11 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
     using CDataType      = remove_cvref_t<typename Problem::CDataType>;
     using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
 
+    static constexpr index_t APackedSize =
+        ck_tile::numeric_traits<remove_cvref_t<ADataType>>::PackedSize;
+    static constexpr index_t BPackedSize =
+        ck_tile::numeric_traits<remove_cvref_t<BDataType>>::PackedSize;
+
     using ALayout = remove_cvref_t<typename Problem::ALayout>;
     using BLayout = remove_cvref_t<typename Problem::BLayout>;
     using CLayout = remove_cvref_t<typename Problem::CLayout>;
@@ -196,12 +201,12 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
 
             // A/B split schedule
             // compiler is likely to use ds_read2 when instruction width smaller than 16bytes
-            constexpr auto num_ds_read_inst_a = A_LDS_Read_Width * sizeof(ADataType) == 16
-                                                    ? A_LDS_Read_Inst_Num
-                                                    : A_LDS_Read_Inst_Num / 2;
-            constexpr auto num_ds_read_inst_b = B_LDS_Read_Width * sizeof(BDataType) == 16
-                                                    ? B_LDS_Read_Inst_Num
-                                                    : B_LDS_Read_Inst_Num / 2;
+            constexpr auto num_ds_read_inst_a =
+                A_LDS_Read_Width * sizeof(ADataType) / APackedSize == 16 ? A_LDS_Read_Inst_Num
+                                                                         : A_LDS_Read_Inst_Num / 2;
+            constexpr auto num_ds_read_inst_b =
+                B_LDS_Read_Width * sizeof(BDataType) / BPackedSize == 16 ? B_LDS_Read_Inst_Num
+                                                                         : B_LDS_Read_Inst_Num / 2;
 
             constexpr auto num_ds_write_inst_a = A_LDS_Write_Inst_Num;
             constexpr auto num_ds_write_inst_b = B_LDS_Write_Inst_Num;
@@ -213,9 +218,9 @@ struct GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3<Problem>
 
             constexpr auto mfma_cycle = NPerXDL == 16 ? 16 : 32;
             constexpr auto ds_read_a_issue_cycle =
-                A_LDS_Read_Width * sizeof(ADataType) == 16 ? 8 : 4;
+                A_LDS_Read_Width * sizeof(ADataType) / APackedSize == 16 ? 8 : 4;
             constexpr auto ds_read_b_issue_cycle =
-                B_LDS_Read_Width * sizeof(BDataType) == 16 ? 8 : 4;
+                B_LDS_Read_Width * sizeof(BDataType) / BPackedSize == 16 ? 8 : 4;
             constexpr auto ds_read_a_mfma_rate =
                 (mfma_cycle - 4 + 2 * ds_read_a_issue_cycle - 1) / (2 * ds_read_a_issue_cycle);
             constexpr auto ds_read_b_mfma_rate =
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp
index ea8d063fd5..b679f8c8aa 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp
@@ -60,6 +60,13 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
     using CDataType      = remove_cvref_t<typename Problem::CDataType>;
     using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
 
+    static_assert(!std::is_same_v<BDataType, pk_int4_t>, "Not implemented");
+
+    static constexpr index_t APackedSize =
+        ck_tile::numeric_traits<remove_cvref_t<ADataType>>::PackedSize;
+    static constexpr index_t BPackedSize =
+        ck_tile::numeric_traits<remove_cvref_t<BDataType>>::PackedSize;
+
     using ALayout = remove_cvref_t<typename Problem::ALayout>;
     using BLayout = remove_cvref_t<typename Problem::BLayout>;
     using CLayout = remove_cvref_t<typename Problem::CLayout>;
@@ -139,12 +146,12 @@ struct GemmPipelineAgBgCrCompV4 : public BaseGemmPipelineAgBgCrCompV4<Problem>
                                                 (BlockSize / WaveSize) /
                                                 (MPerXDL * NPerXDL * KPerXDL);
 
-            constexpr auto num_ds_read_inst_a = A_LDS_Read_Width * sizeof(ADataType) == 16
-                                                    ? A_LDS_Read_Inst_Num
-                                                    : A_LDS_Read_Inst_Num / 2;
-            constexpr auto num_ds_read_inst_b = B_LDS_Read_Width * sizeof(BDataType) == 16
-                                                    ? B_LDS_Read_Inst_Num
-                                                    : B_LDS_Read_Inst_Num / 2;
+            constexpr auto num_ds_read_inst_a =
+                A_LDS_Read_Width * sizeof(ADataType) / APackedSize == 16 ? A_LDS_Read_Inst_Num
+                                                                         : A_LDS_Read_Inst_Num / 2;
+            constexpr auto num_ds_read_inst_b =
+                B_LDS_Read_Width * sizeof(BDataType) / BPackedSize == 16 ? B_LDS_Read_Inst_Num
+                                                                         : B_LDS_Read_Inst_Num / 2;
 
             constexpr auto num_ds_read_inst     = num_ds_read_inst_a + num_ds_read_inst_b;
             constexpr auto num_ds_write_inst    = A_LDS_Write_Inst_Num + B_LDS_Write_Inst_Num;
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
index cde31f087b..b8b2d5b1c9 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp
@@ -21,6 +21,13 @@ struct BaseGemmPipelineAgBgCrMem
     using BDataType      = remove_cvref_t<typename Problem::BDataType>;
     using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
 
+    static_assert(!std::is_same_v<BDataType, pk_int4_t>, "Not implemented");
+
+    static constexpr index_t APackedSize =
+        ck_tile::numeric_traits<remove_cvref_t<ADataType>>::PackedSize;
+    static constexpr index_t BPackedSize =
+        ck_tile::numeric_traits<remove_cvref_t<BDataType>>::PackedSize;
+
     CK_TILE_HOST_DEVICE static constexpr auto TransposeC() { return Problem::TransposeC; }
 
     static constexpr index_t BlockSize = Problem::kBlockSize;
@@ -33,9 +40,11 @@ struct BaseGemmPipelineAgBgCrMem
 
     static constexpr index_t WgpPerCU =
         (4 * get_warp_size() / BlockSize) >= 1 ? 4 * get_warp_size() / BlockSize : 1;
-    static constexpr index_t FullMemBandPrefetchStages = integer_divide_ceil(
-        MinMemInFlyBytes / WgpPerCU,
-        (MPerBlock * sizeof(ADataType) + NPerBlock * sizeof(BDataType)) * KPerBlock);
+    static constexpr index_t FullMemBandPrefetchStages =
+        integer_divide_ceil(MinMemInFlyBytes / WgpPerCU,
+                            (MPerBlock * sizeof(ADataType) / APackedSize +
+                             NPerBlock * sizeof(BDataType) / BPackedSize) *
+                                KPerBlock);
     static constexpr index_t PrefetchStages =
         FullMemBandPrefetchStages >= 2
             ? FullMemBandPrefetchStages <= 8 ? FullMemBandPrefetchStages : 8
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
index 2d9f95627c..c7115c8eb4 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp
@@ -67,16 +67,22 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeA()
     {
-        constexpr index_t smem_size_a = sizeof(typename Problem::ADataType) *
-                                        MakeALdsBlockDescriptor<Problem>().get_element_space_size();
+        constexpr index_t PackedSize =
+            ck_tile::numeric_traits<remove_cvref_t<typename Problem::ADataType>>::PackedSize;
+        constexpr index_t smem_size_a =
+            sizeof(typename Problem::ADataType) *
+            MakeALdsBlockDescriptor<Problem>().get_element_space_size() / PackedSize;
         return smem_size_a;
     }
 
     template <typename Problem>
     CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSizeB()
     {
-        constexpr index_t smem_size_b = sizeof(typename Problem::BDataType) *
-                                        MakeBLdsBlockDescriptor<Problem>().get_element_space_size();
+        constexpr index_t PackedSize =
+            ck_tile::numeric_traits<remove_cvref_t<typename Problem::BDataType>>::PackedSize;
+        constexpr index_t smem_size_b =
+            sizeof(typename Problem::BDataType) *
+            MakeBLdsBlockDescriptor<Problem>().get_element_space_size() / PackedSize;
         return smem_size_b;
     }
 
@@ -387,8 +393,8 @@ struct GemmPipelineAGmemBGmemCRegV1DefaultPolicy
         using AccDataType     = float;
         using BlockWarps      = typename Problem::BlockGemmShape::BlockWarps;
         using WarpTile        = typename Problem::BlockGemmShape::WarpTile;
-        using WarpGemm        = WarpGemmMfmaDispatcher<typename Problem::ADataType,
-                                                typename Problem::BDataType,
+        using WarpGemm        = WarpGemmMfmaDispatcher<typename Problem::ComputeDataType,
+                                                typename Problem::ComputeDataType,
                                                 AccDataType,
                                                 WarpTile::at(I0),
                                                 WarpTile::at(I1),
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp
index ce2dc9fb96..fe706113ae 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp
@@ -20,6 +20,11 @@ struct GemmPipelineAGmemBGmemCRegV2
     using CDataType      = remove_cvref_t<typename Problem::CDataType>;
     using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
 
+    static constexpr index_t APackedSize =
+        ck_tile::numeric_traits<remove_cvref_t<ADataType>>::PackedSize;
+    static constexpr index_t BPackedSize =
+        ck_tile::numeric_traits<remove_cvref_t<BDataType>>::PackedSize;
+
     static constexpr index_t kBlockSize = Problem::kBlockSize;
 
     static constexpr index_t kMPerBlock = BlockGemmShape::kM;
@@ -37,13 +42,15 @@ struct GemmPipelineAGmemBGmemCRegV2
 
     CK_TILE_HOST_DEVICE static constexpr index_t GetStaticLdsSize()
     {
-        return integer_divide_ceil(
-                   sizeof(ADataType) *
-                       Policy::template MakeALdsBlockDescriptor<Problem>().get_element_space_size(),
-                   16) *
+        return integer_divide_ceil(sizeof(ADataType) *
+                                       Policy::template MakeALdsBlockDescriptor<Problem>()
+                                           .get_element_space_size() /
+                                       APackedSize,
+                                   16) *
                    16 +
                sizeof(BDataType) *
-                   Policy::template MakeBLdsBlockDescriptor<Problem>().get_element_space_size();
+                   Policy::template MakeBLdsBlockDescriptor<Problem>().get_element_space_size() /
+                   BPackedSize;
     }
 
     template <typename ADramBlockWindowTmp,
@@ -75,7 +82,8 @@ struct GemmPipelineAGmemBGmemCRegV2
         auto a_lds_block = make_tensor_view<address_space_enum::lds>(p_a_lds, a_lds_block_desc);
 
         constexpr index_t a_lds_block_space_size_aligned =
-            integer_divide_ceil(sizeof(ADataType) * a_lds_block_desc.get_element_space_size(), 16) *
+            integer_divide_ceil(
+                sizeof(ADataType) * a_lds_block_desc.get_element_space_size() / APackedSize, 16) *
             16;
 
         // B tile in LDS
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp
index 771662f566..f833ccc849 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp
@@ -13,14 +13,16 @@ template <typename ADataType_,
           typename BDataType_,
           typename CDataType_,
           typename BlockGemmShape_,
-          typename Traits_>
+          typename Traits_,
+          typename ComputeDataType_ = ADataType_>
 struct GemmPipelineProblemBase
 {
     using Traits = remove_cvref_t<Traits_>;
 
-    using ADataType = remove_cvref_t<ADataType_>;
-    using BDataType = remove_cvref_t<BDataType_>;
-    using CDataType = remove_cvref_t<CDataType_>;
+    using ADataType       = remove_cvref_t<ADataType_>;
+    using BDataType       = remove_cvref_t<BDataType_>;
+    using CDataType       = remove_cvref_t<CDataType_>;
+    using ComputeDataType = remove_cvref_t<ComputeDataType_>;
 
     using BlockGemmShape = remove_cvref_t<BlockGemmShape_>;
 
@@ -53,13 +55,15 @@ struct GemmPipelineProblemBase
 
     CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentA()
     {
+        constexpr index_t PackedSize =
+            ck_tile::numeric_traits<remove_cvref_t<ADataType>>::PackedSize;
         if constexpr(std::is_same_v<ALayout, ck_tile::tensor_layout::gemm::ColumnMajor>)
         {
             constexpr index_t pixels_per_thread =
                 BlockGemmShape::kM * BlockGemmShape::kK / kBlockSize;
-            return pixels_per_thread < VectorLoadSize / sizeof(ADataType)
+            return pixels_per_thread < PackedSize * VectorLoadSize / sizeof(ADataType)
                        ? pixels_per_thread
-                       : VectorLoadSize / sizeof(ADataType);
+                       : PackedSize * VectorLoadSize / sizeof(ADataType);
         }
         else
         {
@@ -69,17 +73,19 @@ struct GemmPipelineProblemBase
 
     CK_TILE_HOST_DEVICE static constexpr auto GetAlignmentB()
     {
+        constexpr index_t PackedSize =
+            ck_tile::numeric_traits<remove_cvref_t<BDataType>>::PackedSize;
         if constexpr(std::is_same_v<BLayout, ck_tile::tensor_layout::gemm::RowMajor>)
         {
             constexpr index_t pixels_per_thread =
                 BlockGemmShape::kN * BlockGemmShape::kK / kBlockSize;
-            return pixels_per_thread < VectorLoadSize / sizeof(BDataType)
+            return pixels_per_thread < PackedSize * VectorLoadSize / sizeof(BDataType)
                        ? pixels_per_thread
-                       : VectorLoadSize / sizeof(BDataType);
+                       : PackedSize * VectorLoadSize / sizeof(BDataType);
         }
         else
         {
-            return VectorLoadSize / sizeof(BDataType);
+            return PackedSize * VectorLoadSize / sizeof(BDataType);
         }
     }
 
@@ -143,9 +149,14 @@ template <typename ADataType_,
           typename BDataType_,
           typename CDataType_,
           typename BlockGemmShape_,
-          typename Traits_>
-using GemmPipelineProblem =
-    GemmPipelineProblemBase<ADataType_, BDataType_, CDataType_, BlockGemmShape_, Traits_>;
+          typename Traits_,
+          typename ComputeDataType_ = ADataType_>
+using GemmPipelineProblem = GemmPipelineProblemBase<ADataType_,
+                                                    BDataType_,
+                                                    CDataType_,
+                                                    BlockGemmShape_,
+                                                    Traits_,
+                                                    ComputeDataType_>;
 
 template <typename ADataType_,
           typename BDataType_,
@@ -154,14 +165,16 @@ template <typename ADataType_,
           typename Traits_,
           GemmPipelineScheduler Scheduler_ = GemmPipelineScheduler::Intrawave,
           bool HasHotLoop_                 = true,
-          TailNumber TailNum_              = TailNumber::Full>
+          TailNumber TailNum_              = TailNumber::Full,
+          typename ComputeDataType_        = ADataType_>
 struct UniversalGemmPipelineProblem
 {
     using Traits = remove_cvref_t<Traits_>;
 
-    using ADataType = remove_cvref_t<ADataType_>;
-    using BDataType = remove_cvref_t<BDataType_>;
-    using CDataType = remove_cvref_t<CDataType_>;
+    using ADataType       = remove_cvref_t<ADataType_>;
+    using BDataType       = remove_cvref_t<BDataType_>;
+    using CDataType       = remove_cvref_t<CDataType_>;
+    using ComputeDataType = remove_cvref_t<ComputeDataType_>;
 
     using BlockGemmShape = remove_cvref_t<BlockGemmShape_>;
 
diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
index c20d09cea4..fd1e76a02b 100644
--- a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
+++ b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp
@@ -34,31 +34,41 @@ struct UniversalGemmBasePolicy
         constexpr index_t BlockSize           = Problem::kBlockSize;
         constexpr index_t KPerBlock           = Problem::BlockGemmShape::kK;
         constexpr index_t elements_per_thread = MNPerBlock * KPerBlock / BlockSize;
+        constexpr index_t PackedSize =
+            ck_tile::numeric_traits<remove_cvref_t<DataType>>::PackedSize;
 
         // Assume DataType is even!
-        if constexpr(XPerTile % (16 / sizeof(DataType)) == 0 &&
-                     elements_per_thread % (16 / sizeof(DataType)) == 0)
+        if constexpr(XPerTile % (PackedSize * 32 / sizeof(DataType)) == 0 &&
+                     elements_per_thread % (PackedSize * 32 / sizeof(DataType)) == 0 &&
+                     PackedSize == 2)
         {
-            return (16 / sizeof(DataType));
+            return (PackedSize * 32 / sizeof(DataType));
         }
-        else if constexpr(XPerTile % (8 / sizeof(DataType)) == 0 &&
-                          elements_per_thread % (8 / sizeof(DataType)) == 0)
+        else if constexpr(XPerTile % (PackedSize * 16 / sizeof(DataType)) == 0 &&
+                          elements_per_thread % (PackedSize * 16 / sizeof(DataType)) == 0)
         {
-            return (8 / sizeof(DataType));
+            return (PackedSize * 16 / sizeof(DataType));
         }
-        else if constexpr(sizeof(DataType) >= 4 && XPerTile % (4 / sizeof(DataType)) == 0 &&
-                          elements_per_thread % (4 / sizeof(DataType)) == 0)
+        else if constexpr(XPerTile % (PackedSize * 8 / sizeof(DataType)) == 0 &&
+                          elements_per_thread % (PackedSize * 8 / sizeof(DataType)) == 0)
         {
-            return (4 / sizeof(DataType));
+            return (PackedSize * 8 / sizeof(DataType));
         }
-        else if constexpr(sizeof(DataType) >= 2 && XPerTile % (2 / sizeof(DataType)) == 0 &&
-                          elements_per_thread % (2 / sizeof(DataType)) == 0)
+        else if constexpr(sizeof(DataType) >= PackedSize * 4 &&
+                          XPerTile % (PackedSize * 4 / sizeof(DataType)) == 0 &&
+                          elements_per_thread % (PackedSize * 4 / sizeof(DataType)) == 0)
         {
-            return (2 / sizeof(DataType));
+            return (PackedSize * 4 / sizeof(DataType));
+        }
+        else if constexpr(sizeof(DataType) >= PackedSize * 2 &&
+                          XPerTile % (PackedSize * 2 / sizeof(DataType)) == 0 &&
+                          elements_per_thread % (PackedSize * 2 / sizeof(DataType)) == 0)
+        {
+            return (PackedSize * 2 / sizeof(DataType));
         }
         else
         {
-            return 1;
+            return PackedSize;
         }
     }
 
@@ -564,8 +574,8 @@ struct UniversalGemmPipelineAgBgCrPolicy
     {
         using BlockWarps      = typename Problem::BlockGemmShape::BlockWarps;
         using WarpTile        = typename Problem::BlockGemmShape::WarpTile;
-        using WarpGemm        = WarpGemmMfmaDispatcher<typename Problem::ADataType,
-                                                typename Problem::BDataType,
+        using WarpGemm        = WarpGemmMfmaDispatcher<typename Problem::ComputeDataType,
+                                                typename Problem::ComputeDataType,
                                                 typename Problem::CDataType,
                                                 WarpTile::at(I0),
                                                 WarpTile::at(I1),
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_fpAintB_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_fpAintB_gemm.hpp
index ac392f0906..0417cfe2a2 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_fpAintB_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_fpAintB_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -73,39 +73,9 @@ struct ReferencefpAintBGemm : public device::BaseOperator
                     ScaleDataType v_scale;
                     ADataType v_converted_b;
 
-                    // use PassThrough instead of ConvertBF16RTN for reference calculation
-                    if constexpr(is_same_v<AElementwiseOperation,
-                                           ck::tensor_operation::element_wise::ConvertBF16RTN>)
-                    {
-                        ck::tensor_operation::element_wise::PassThrough{}(v_a, arg.a_m_k_(m, k));
-                    }
-                    else
-                    {
-                        arg.a_element_op_(v_a, arg.a_m_k_(m, k));
-                    }
-
-                    // same for B matrix
-                    if constexpr(is_same_v<BElementwiseOperation,
-                                           ck::tensor_operation::element_wise::ConvertBF16RTN>)
-                    {
-                        ck::tensor_operation::element_wise::PassThrough{}(v_b, arg.b_k_n_(k, n));
-                    }
-                    else
-                    {
-                        arg.b_element_op_(v_b, arg.b_k_n_(k, n));
-                    }
-
-                    // same for scale matrix
-                    if constexpr(is_same_v<BElementwiseOperation,
-                                           ck::tensor_operation::element_wise::ConvertBF16RTN>)
-                    {
-                        ck::tensor_operation::element_wise::PassThrough{}(v_scale,
-                                                                          arg.scale_k_n_(k, n));
-                    }
-                    else
-                    {
-                        arg.b_element_op_(v_scale, arg.scale_k_n_(k, n));
-                    }
+                    arg.a_element_op_(v_a, arg.a_m_k_(m, k));
+                    arg.b_element_op_(v_b, arg.b_k_n_(k, n));
+                    arg.b_element_op_(v_scale, arg.scale_k_n_(k, n));
 
                     v_converted_b = type_convert<ADataType>(v_b) * v_scale;
                     v_acc += ck::type_convert<AccDataType>(v_a) *
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
index 1ae11fe9db..7e2482807d 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -68,21 +68,32 @@ struct ReferenceGemm : public device::BaseOperator
 
                 for(int k = 0; k < K; ++k)
                 {
-                    // use PassThrough instead of ConvertBF16RTN for reference calculation
-                    if constexpr(is_same_v<AElementwiseOperation,
-                                           ck::tensor_operation::element_wise::ConvertBF16RTN>)
+                    if constexpr(is_same_v<ADataType, pk_i4_t>)
                     {
-                        ck::tensor_operation::element_wise::PassThrough{}(v_a, arg.a_m_k_(m, k));
+                        uint8_t i4x2 = arg.a_m_k_(m, k).data;
+                        int8_t i4    = 0;
+                        if(k % 2 == 1)
+                            i4 = (i4x2 >> 0) & 0xf;
+                        else
+                            i4 = (i4x2 >> 4) & 0xf;
+                        i4  = i4 - 8;
+                        v_a = type_convert<ComputeTypeA>(i4);
                     }
                     else
                     {
                         arg.a_element_op_(v_a, arg.a_m_k_(m, k));
                     }
-                    // same for B matrix
-                    if constexpr(is_same_v<BElementwiseOperation,
-                                           ck::tensor_operation::element_wise::ConvertBF16RTN>)
+
+                    if constexpr(is_same_v<BDataType, pk_i4_t>)
                     {
-                        ck::tensor_operation::element_wise::PassThrough{}(v_b, arg.b_k_n_(k, n));
+                        uint8_t i4x2 = arg.b_k_n_(k, n).data;
+                        int8_t i4    = 0;
+                        if(k % 2 == 1)
+                            i4 = (i4x2 >> 0) & 0xf;
+                        else
+                            i4 = (i4x2 >> 4) & 0xf;
+                        i4  = i4 - 8;
+                        v_b = type_convert<ComputeTypeB>(i4);
                     }
                     else
                     {
diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_multiple_d.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_multiple_d.hpp
index 06b2285229..7f263db707 100644
--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_multiple_d.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_gemm_multiple_d.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -74,26 +74,8 @@ struct ReferenceGemmMultipleD : public device::BaseOperator
 
                 for(int k = 0; k < K; ++k)
                 {
-                    // use PassThrough instead of ConvertBF16RTN for reference calculation
-                    if constexpr(is_same_v<AElementwiseOperation,
-                                           ck::tensor_operation::element_wise::ConvertBF16RTN>)
-                    {
-                        ck::tensor_operation::element_wise::PassThrough{}(v_a, arg.a_m_k_(m, k));
-                    }
-                    else
-                    {
-                        arg.a_element_op_(v_a, arg.a_m_k_(m, k));
-                    }
-                    // same for B matrix
-                    if constexpr(is_same_v<BElementwiseOperation,
-                                           ck::tensor_operation::element_wise::ConvertBF16RTN>)
-                    {
-                        ck::tensor_operation::element_wise::PassThrough{}(v_b, arg.b_k_n_(k, n));
-                    }
-                    else
-                    {
-                        arg.b_element_op_(v_b, arg.b_k_n_(k, n));
-                    }
+                    arg.a_element_op_(v_a, arg.a_m_k_(m, k));
+                    arg.b_element_op_(v_b, arg.b_k_n_(k, n));
 
                     v_acc +=
                         ck::type_convert<AccDataType>(v_a) * ck::type_convert<AccDataType>(v_b);
diff --git a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
index 4358953a58..4a44c425aa 100644
--- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
+++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp
@@ -22,6 +22,7 @@ using I8   = int8_t;
 using I32  = int32_t;
 using F8   = ck::f8_t;
 using BF8  = ck::bf8_t;
+using I4   = ck::pk_i4_t;
 
 using Empty_Tuple = ck::Tuple<>;
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
index 3b3baf6978..2dc2061015 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm.hpp
@@ -15,6 +15,9 @@
 #ifdef DL_KERNELS
 #include "gemm_dl.inc"
 #endif
+#ifdef DPP_KERNELS
+#include "gemm_dpp.inc"
+#endif
 #ifdef CK_USE_WMMA
 #include "gemm_wmma.inc"
 #endif
@@ -92,32 +95,24 @@ struct DeviceOperationInstanceFactory<
             {
                 add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
                 add_device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances(op_ptrs);
-                add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
-                add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instances(op_ptrs);
             }
             else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
                               is_same_v<CLayout, Row>)
             {
                 add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
                 add_device_gemm_dl_f16_f16_f16_mk_nk_mn_irregular_instances(op_ptrs);
-                add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
-                add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instances(op_ptrs);
             }
             else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
                               is_same_v<CLayout, Row>)
             {
                 add_device_gemm_dl_f16_f16_f16_km_kn_mn_instances(op_ptrs);
                 add_device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances(op_ptrs);
-                add_device_gemm_dpp_f16_f16_f16_km_kn_mn_instances(op_ptrs);
-                add_device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instances(op_ptrs);
             }
             else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
                               is_same_v<CLayout, Row>)
             {
                 add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(op_ptrs);
                 add_device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances(op_ptrs);
-                add_device_gemm_dpp_f16_f16_f16_km_nk_mn_instances(op_ptrs);
-                add_device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instances(op_ptrs);
             }
         }
 #endif
@@ -153,6 +148,39 @@ struct DeviceOperationInstanceFactory<
 #endif
 #endif // DL_KERNELS
 
+#ifdef DPP_KERNELS
+#ifdef CK_ENABLE_FP16
+        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
+                     is_same_v<CDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_instances(op_ptrs);
+                add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_instances(op_ptrs);
+                add_device_gemm_dpp_f16_f16_f16_mk_nk_mn_irregular_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_dpp_f16_f16_f16_km_kn_mn_instances(op_ptrs);
+                add_device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instances(op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_dpp_f16_f16_f16_km_nk_mn_instances(op_ptrs);
+                add_device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instances(op_ptrs);
+            }
+        }
+#endif
+#endif // DPP_KERNELS
+
 #ifdef CK_USE_WMMA
 #ifdef CK_ENABLE_FP16
         if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_dl.inc b/library/include/ck/library/tensor_operation_instance/gpu/gemm_dl.inc
index 44a11f6284..0fee4190a6 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_dl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_dl.inc
@@ -28,16 +28,6 @@ void add_device_gemm_dl_f16_f16_f16_km_kn_mn_irregular_instances(
         DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);
 
-void add_device_gemm_dpp_f16_f16_f16_km_kn_mn_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_dpp_f16_f16_f16_km_kn_mn_irregular_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemm<Col, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
 void add_device_gemm_dl_f16_f16_f16_km_nk_mn_instances(
     std::vector<std::unique_ptr<
         DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
@@ -48,16 +38,6 @@ void add_device_gemm_dl_f16_f16_f16_km_nk_mn_irregular_instances(
         DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);
 
-void add_device_gemm_dpp_f16_f16_f16_km_nk_mn_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_dpp_f16_f16_f16_km_nk_mn_irregular_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemm<Col, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
 void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_instances(
     std::vector<std::unique_ptr<
         DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
@@ -68,16 +48,6 @@ void add_device_gemm_dl_f16_f16_f16_mk_kn_mn_irregular_instances(
         DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);
 
-void add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
-void add_device_gemm_dpp_f16_f16_f16_mk_kn_mn_irregular_instances(
-    std::vector<std::unique_ptr<
-        DeviceGemm<Row, Row, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
-        instances);
-
 void add_device_gemm_dl_f16_f16_f16_mk_nk_mn_instances(
     std::vector<std::unique_ptr<
         DeviceGemm<Row, Col, Row, F16, F16, F16, PassThrough, PassThrough, PassThrough>>>&
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply.hpp
index c76b0fdfe1..58f8ed3c3c 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply.hpp
@@ -174,6 +174,86 @@ void add_device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_mem_v2_kpadding_i
                                                           PassThrough,
                                                           MultiplyMultiply>>>& instances);
 #endif
+#ifdef CK_ENABLE_FP16
+void add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Tuple<Row, Col>,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          Tuple<F32, F32>,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances);
+
+void add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Tuple<Row, Col>,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          Tuple<F32, F32>,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances);
+
+void add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Tuple<Row, Col>,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          Tuple<F32, F32>,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances);
+
+void add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Tuple<Row, Col>,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          Tuple<F32, F32>,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances);
+
+void add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Tuple<Row, Col>,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          Tuple<F32, F32>,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances);
+
+void add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitK<Row,
+                                                          Col,
+                                                          Tuple<Row, Col>,
+                                                          Row,
+                                                          F8,
+                                                          F8,
+                                                          Tuple<F32, F32>,
+                                                          F16,
+                                                          PassThrough,
+                                                          PassThrough,
+                                                          MultiplyMultiply>>>& instances);
+#endif
+#endif
 
 #ifdef CK_ENABLE_FP16
 void add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_comp_default_instances_part1(
@@ -332,7 +412,6 @@ void add_device_gemm_multiply_multiply_xdl_f8_f8_f16_mk_nk_mn_mem_v2_kpadding_in
                                                           PassThrough,
                                                           MultiplyMultiply>>>& instances);
 #endif
-#endif
 
 #if(defined(CK_ENABLE_FP16) || defined(CK_ENABLE_INT8))
 void add_device_gemm_multiply_multiply_xdl_i8_i8_f16_mk_nk_mn_comp_default_instances(
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle.hpp
index 04ff9fdb88..90da3ad0fe 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle.hpp
@@ -60,47 +60,61 @@ void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_m
                                                                      MultiplyMultiply>>>&
         instances);
 
-#if 0
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_padding_instances(
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instances_v2(
     std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                          Col,
-                                                          Tuple<Row, Col>,
-                                                          Row,
-                                                          F8,
-                                                          F8,
-                                                          Tuple<F32, F32>,
-                                                          F16,
-                                                          PassThrough,
-                                                          PassThrough,
-                                                          MultiplyMultiply>>>& instances);
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
 
-
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_padding_instances(
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instances_v2(
     std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                          Col,
-                                                          Tuple<Row, Col>,
-                                                          Row,
-                                                          F8,
-                                                          F8,
-                                                          Tuple<F32, F32>,
-                                                          F16,
-                                                          PassThrough,
-                                                          PassThrough,
-                                                          MultiplyMultiply>>>& instances);
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
 
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_padding_instances(
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instances_v2(
     std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
-                                                          Col,
-                                                          Tuple<Row, Col>,
-                                                          Row,
-                                                          F8,
-                                                          F8,
-                                                          Tuple<F32, F32>,
-                                                          F16,
-                                                          PassThrough,
-                                                          PassThrough,
-                                                          MultiplyMultiply>>>& instances);
-#endif
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
 #endif
 
 #if(defined(CK_ENABLE_BF16) || defined(CK_ENABLE_FP8))
@@ -145,8 +159,8 @@ void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_
                                                                      PassThrough,
                                                                      MultiplyMultiply>>>&
         instances);
-#if 0
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_padding_instances(
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instances_v2(
     std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
                                                                      Col,
                                                                      Tuple<Row, Col>,
@@ -160,7 +174,7 @@ void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_
                                                                      MultiplyMultiply>>>&
         instances);
 
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_padding_instances(
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instances_v2(
     std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
                                                                      Col,
                                                                      Tuple<Row, Col>,
@@ -174,7 +188,21 @@ void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_
                                                                      MultiplyMultiply>>>&
         instances);
 
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_padding_instances(
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instances_v2(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>&
+        instances);
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
                                                                      Col,
                                                                      Tuple<Row, Col>,
@@ -187,7 +215,6 @@ void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_
                                                                      PassThrough,
                                                                      MultiplyMultiply>>>&
         instances);
-#endif
 #endif
 
 template <typename ADataType,
@@ -240,14 +267,16 @@ struct DeviceOperationInstanceFactory<
                     op_ptrs);
                 add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instances(
                     op_ptrs);
-#if 0
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_padding_instances(
+
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instances_v2(
                     op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_padding_instances(
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instances_v2(
                     op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_padding_instances(
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instances_v2(
+                    op_ptrs);
+
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instances(
                     op_ptrs);
-#endif
             }
         }
 #endif
@@ -265,14 +294,16 @@ struct DeviceOperationInstanceFactory<
                     op_ptrs);
                 add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instances(
                     op_ptrs);
-#if 0
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_padding_instances(
+
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instances_v2(
                     op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_padding_instances(
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instances_v2(
                     op_ptrs);
-                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_padding_instances(
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instances_v2(
+                    op_ptrs);
+
+                add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instances(
                     op_ptrs);
-#endif
             }
         }
 #endif
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal.hpp
index 52046a1070..4218c51ca3 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal.hpp
@@ -166,11 +166,22 @@ void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v1_kpadding_instances
     std::vector<std::unique_ptr<
         DeviceGemmV2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);
+
 void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v2_default_instances(
     std::vector<std::unique_ptr<
         DeviceGemmV2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
         instances);
 
+void add_device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, F16, I4, F16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
+void add_device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<
+        DeviceGemmV2<Row, Col, Row, BF16, I4, BF16, PassThrough, PassThrough, PassThrough>>>&
+        instances);
+
 void add_device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instances(
     std::vector<std::unique_ptr<
         DeviceGemmV2<Row, Col, Row, F16, F8, F16, PassThrough, PassThrough, PassThrough>>>&
@@ -810,6 +821,28 @@ struct DeviceOperationInstanceFactory<
             }
         }
 #endif
+
+        if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, pk_i4_t> &&
+                     is_same_v<CDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_v2_default_instances(op_ptrs);
+            }
+        }
+
+        if constexpr(is_same_v<ADataType, bhalf_t> && is_same_v<BDataType, pk_i4_t> &&
+                     is_same_v<CDataType, bhalf_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_v2_default_instances(
+                    op_ptrs);
+            }
+        }
+
         return op_ptrs;
     }
 };
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp
index f44c025177..18203e7d5c 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_universal_streamk.hpp
@@ -238,6 +238,403 @@ void add_device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_v2_mnkpaddin
                                                       PassThrough>>>& instances);
 #endif
 
+#ifdef CK_ENABLE_BF16
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Row,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_mnkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_mnkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Row,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+
+void add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instances(
+    std::vector<std::unique_ptr<DeviceGemm_Streamk_V2<Col,
+                                                      Col,
+                                                      Row,
+                                                      BF16,
+                                                      BF16,
+                                                      BF16,
+                                                      PassThrough,
+                                                      PassThrough,
+                                                      PassThrough>>>& instances);
+#endif
+
 #if(defined(CK_ENABLE_FP8))
 void add_device_gemm_xdl_universal_streamk_f16_f8_f16_mk_kn_mn_comp_default_instances(
     std::vector<std::unique_ptr<
@@ -527,6 +924,109 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemm_S
         }
 #endif
 
+#ifdef CK_ENABLE_BF16
+        if constexpr(is_same_v<ADataType, bhalf_t> && is_same_v<BDataType, bhalf_t> &&
+                     is_same_v<CDataType, bhalf_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_mnkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_mnkpadding_instances(
+                    op_ptrs);
+            }
+            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
+                              is_same_v<CLayout, Row>)
+            {
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_instances(
+                    op_ptrs);
+
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_instances(
+                    op_ptrs);
+                add_device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instances(
+                    op_ptrs);
+            }
+        }
+#endif
+
 #if(defined(CK_ENABLE_FP8))
         if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, f8_t> &&
                      is_same_v<CDataType, half_t>)
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp
index d82f82cce2..3ebfd0bb7d 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_two_stage_xdl_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
@@ -41,11 +41,13 @@ template <ck::index_t NDimSpatial,
           BlockGemmPipelineVersion PipelineVersion>
 using device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_generic_instances =
     std::tuple<
-        // clang-format off
+// clang-format off
         //#########################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
         //#########################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
         //#########################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
         //#########################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,     32,   8,   16,   16,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              1,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              1,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1>
         // clang-format on
         >;
@@ -58,11 +60,13 @@ template <ck::index_t NDimSpatial,
           BlockGemmPipelineScheduler Scheduler,
           BlockGemmPipelineVersion PipelineVersion>
 using device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
         //#########################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
         //#########################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
         //#########################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,     32,   8,   16,   16,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              1,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              1,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1>,
 
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,     32,   8,   32,   32,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              2,              2,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 2>,
@@ -82,13 +86,37 @@ template <ck::index_t NDimSpatial,
           ConvolutionBackwardWeightSpecialization ConvSpec,
           BlockGemmPipelineScheduler Scheduler,
           BlockGemmPipelineVersion PipelineVersion>
-using device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_bf16_generic_instances =
+using device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_f16_irregular_instances =
     std::tuple<
         // clang-format off
         //#########################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
         //#########################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
         //#########################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
         //#########################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    48,    64,     32,   8,   16,   16,    3,    4,  S<4, 16, 1>,  S<2, 0, 1>,  S<2, 0, 1>,                  1,              3,              4,      false,  S<4, 16,  1>,  S<2, 0, 1>,  S<2, 0, 1>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1>,
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    48,     32,   8,   16,   16,    4,    3,  S<4, 16, 1>,  S<2, 0, 1>,  S<2, 0, 1>,                  1,              4,              4,      false,  S<4, 16,  1>,  S<2, 0, 1>,  S<2, 0, 1>,                1,              3,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1>,
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    80,     32,   8,   16,   16,    4,    5,  S<4, 16, 1>,  S<2, 0, 1>,  S<2, 0, 1>,                  1,              4,              4,      false,  S<4, 16,  1>,  S<2, 0, 1>,  S<2, 0, 1>,                1,              5,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1>,
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,   112,     32,   8,   16,   16,    4,    7,  S<4, 16, 1>,  S<2, 0, 1>,  S<2, 0, 1>,                  1,              4,              4,      false,  S<4, 16,  1>,  S<2, 0, 1>,  S<2, 0, 1>,                1,              7,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1>,
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,   208,     32,   8,   16,   16,    4,   13,  S<4, 16, 1>,  S<2, 0, 1>,  S<2, 0, 1>,                  1,              4,              4,      false,  S<4, 16,  1>,  S<2, 0, 1>,  S<2, 0, 1>,                1,             13,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1>
+        // clang-format on
+        >;
+
+template <ck::index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          ConvolutionBackwardWeightSpecialization ConvSpec,
+          BlockGemmPipelineScheduler Scheduler,
+          BlockGemmPipelineVersion PipelineVersion>
+using device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_bf16_generic_instances =
+    std::tuple<
+// clang-format off
+        //#########################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
+        //#########################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
+        //#########################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
+        //#########################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,     32,   8,   16,   16,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              1,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              1,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1>
         // clang-format on
         >;
@@ -101,11 +129,13 @@ template <ck::index_t NDimSpatial,
           BlockGemmPipelineScheduler Scheduler,
           BlockGemmPipelineVersion PipelineVersion>
 using device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_bf16_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
         //#########################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
         //#########################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
         //#########################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,     32,   8,   16,   16,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              1,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              1,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1>,
 
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,     32,   8,   32,   32,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              2,              2,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 2>,
@@ -125,13 +155,37 @@ template <ck::index_t NDimSpatial,
           ConvolutionBackwardWeightSpecialization ConvSpec,
           BlockGemmPipelineScheduler Scheduler,
           BlockGemmPipelineVersion PipelineVersion>
-using device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_f16_generic_instances =
+using device_grouped_conv_bwd_weight_two_stage_nhwgc_xdl_c_shuffle_bf16_irregular_instances =
     std::tuple<
         // clang-format off
         //#########################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
         //#########################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
         //#########################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
         //#########################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     BF16,     BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    48,    64,     32,   8,   16,   16,    3,    4,  S<4, 16, 1>,  S<2, 0, 1>,  S<2, 0, 1>,                  1,              3,              4,      false,  S<4, 16,  1>,  S<2, 0, 1>,  S<2, 0, 1>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1>,
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     BF16,     BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    48,     32,   8,   16,   16,    4,    3,  S<4, 16, 1>,  S<2, 0, 1>,  S<2, 0, 1>,                  1,              4,              4,      false,  S<4, 16,  1>,  S<2, 0, 1>,  S<2, 0, 1>,                1,              3,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1>,
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     BF16,     BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    80,     32,   8,   16,   16,    4,    5,  S<4, 16, 1>,  S<2, 0, 1>,  S<2, 0, 1>,                  1,              4,              4,      false,  S<4, 16,  1>,  S<2, 0, 1>,  S<2, 0, 1>,                1,              5,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1>,
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     BF16,     BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,   112,     32,   8,   16,   16,    4,    7,  S<4, 16, 1>,  S<2, 0, 1>,  S<2, 0, 1>,                  1,              4,              4,      false,  S<4, 16,  1>,  S<2, 0, 1>,  S<2, 0, 1>,                1,              7,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1>,
+        DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     BF16,     BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,   208,     32,   8,   16,   16,    4,   13,  S<4, 16, 1>,  S<2, 0, 1>,  S<2, 0, 1>,                  1,              4,              4,      false,  S<4, 16,  1>,  S<2, 0, 1>,  S<2, 0, 1>,                1,             13,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1>
+        // clang-format on
+        >;
+
+template <ck::index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          ConvolutionBackwardWeightSpecialization ConvSpec,
+          BlockGemmPipelineScheduler Scheduler,
+          BlockGemmPipelineVersion PipelineVersion>
+using device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_f16_generic_instances =
+    std::tuple<
+// clang-format off
+        //#########################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
+        //#########################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
+        //#########################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
+        //#########################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,     32,   8,   16,   16,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              1,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              1,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1, F16, F16, 1, 1>
         // clang-format on
         >;
@@ -145,11 +199,13 @@ template <ck::index_t NDimSpatial,
           BlockGemmPipelineScheduler Scheduler,
           BlockGemmPipelineVersion PipelineVersion>
 using device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_f16_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
         //#########################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
         //#########################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
         //#########################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,     32,   8,   16,   16,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              1,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              1,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1, F16, F16, 1, 1>,
     
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,     32,   8,   32,   32,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              2,              2,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 2, F16, F16, 2, 2>,
@@ -185,11 +241,13 @@ template <ck::index_t NDimSpatial,
           BlockGemmPipelineVersion PipelineVersion>
 using device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_generic_instances =
     std::tuple<
-        // clang-format off
+// clang-format off
         //#########################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
         //#########################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
         //#########################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
         //#########################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,    F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,     32,   8,   16,   16,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              1,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              1,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1, BF16, BF16, 1, 1>
         // clang-format on
         >;
@@ -202,11 +260,13 @@ template <ck::index_t NDimSpatial,
           BlockGemmPipelineScheduler Scheduler,
           BlockGemmPipelineVersion PipelineVersion>
 using device_grouped_conv_bwd_weight_two_stage_ngchw_xdl_c_shuffle_bf16_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################################|              Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm| NumGroups|
         //#########################################|              Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|   ToMerge|
         //#########################################|          Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|          |
         //#########################################|                 |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |          |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,    F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,     32,   8,   16,   16,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              1,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              1,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 1, BF16, BF16, 1, 1>,
     
         DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,    F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,     32,   8,   32,   32,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              2,              2,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,                  1, Scheduler, PipelineVersion, 2, BF16, BF16, 2, 2>,
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp
new file mode 100644
index 0000000000..b445e0001d
--- /dev/null
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp
@@ -0,0 +1,112 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using namespace ck::tensor_layout::convolution;
+
+using BF16 = ck::bhalf_t;
+using F16  = ck::half_t;
+using F32  = float;
+
+#ifdef CK_ENABLE_FP8
+using F8 = ck::f8_t;
+#endif
+
+#ifdef CK_ENABLE_BF8
+using BF8 = ck::bf8_t;
+#endif
+
+using Empty_Tuple = ck::Tuple<>;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+static constexpr auto ConvBwdWeightDefault =
+    ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Default;
+
+static constexpr auto ConvBwdWeightFilter1x1Stride1Pad0 =
+    ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0;
+
+template <ck::index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          ConvolutionBackwardWeightSpecialization ConvSpec,
+          BlockGemmPipelineScheduler Scheduler,
+          BlockGemmPipelineVersion PipelineVersion>
+using device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f32_instances = std::tuple<
+    // clang-format off
+        //#########################################|     Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm|
+        //#########################################|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|
+        //#########################################| Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|
+        //#########################################|        |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |
+        // generic instance
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    16,    16,     32,   8,   16,   16,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              1,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              1,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  2, Scheduler, PipelineVersion>
+    // clang-format on
+    >;
+
+template <ck::index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          ConvolutionBackwardWeightSpecialization ConvSpec,
+          BlockGemmPipelineScheduler Scheduler,
+          BlockGemmPipelineVersion PipelineVersion>
+using device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f16_instances = std::tuple<
+    // clang-format off
+        //#########################################|     Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm|
+        //#########################################|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|
+        //#########################################| Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|
+        //#########################################|        |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |
+        // generic instance
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,     32,   8,   32,   32,    1,    1,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              2,              2,      false,  S<4, 16, 1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,                  2, Scheduler, PipelineVersion>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     32,   8,   32,   32,    1,    2,  S<4, 8,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              4,              4,      false,  S<4, 16, 1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  2, Scheduler, PipelineVersion>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,   128,     32,   8,   32,   32,    1,    4,  S<4, 4,  1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              8,              8,      false,  S<4, 16, 1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              8,              8,      false,           1,           1,   S<1, 8, 1, 8>,                  2, Scheduler, PipelineVersion>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,     32,   8,   32,   32,    2,    1,  S<4, 16, 1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              4,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  2, Scheduler, PipelineVersion>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,   128,    32,     32,   8,   32,   32,    4,    1,  S<4, 16, 1>, S<2, 0, 1>,  S<1, 0, 2>,                   1,              8,              8,      false,  S<4, 4,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              8,              8,      false,           1,           1,   S<1, 8, 1, 8>,                  2, Scheduler, PipelineVersion>,       
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    80,     32,   8,   16,   16,    4,    5,  S<4, 16, 1>, S<2, 0, 1>,  S<2, 0, 1>,                   1,              4,              4,      false,  S<4, 16, 1>,  S<2, 0, 1>,  S<2, 0, 1>,                1,              5,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  2, Scheduler, PipelineVersion>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,   112,     32,   8,   16,   16,    4,    7,  S<4, 16, 1>, S<2, 0, 1>,  S<2, 0, 1>,                   1,              4,              4,      false,  S<4, 16, 1>,  S<2, 0, 1>,  S<2, 0, 1>,                1,              7,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  2, Scheduler, PipelineVersion>
+    // clang-format on
+    >;
+
+template <ck::index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          ConvolutionBackwardWeightSpecialization ConvSpec,
+          BlockGemmPipelineScheduler Scheduler,
+          BlockGemmPipelineVersion PipelineVersion>
+using device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_bf16_instances = std::tuple<
+    // clang-format off
+        //#########################################|     Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| BlockGemm| BlockGemm|
+        //#########################################|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|  Pipeline|  Pipeline|
+        //#########################################| Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl| Scheduler|   Version|
+        //#########################################|        |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |          |          |
+        // generic instance
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    32,     32,   8,   32,   32,    1,    1,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                  1,              2,              2,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              2,              2,      false,           1,           1,   S<1, 8, 1, 8>,                  2, Scheduler, PipelineVersion>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     32,   8,   32,   32,    1,    2,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                  1,              4,              4,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  2, Scheduler, PipelineVersion>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,   128,     32,   8,   32,   32,    1,    4,  S<4, 4,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                  1,              8,              8,      false,  S<4, 16,  1>, S<2, 0, 1>,  S<1, 0, 2>,                1,              8,              8,      false,           1,           1,   S<1, 8, 1, 8>,                  2, Scheduler, PipelineVersion>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,     32,   8,   32,   32,    2,    1,  S<4, 16, 1>,  S<2, 0, 1>,  S<1, 0, 2>,                  1,              4,              4,      false,  S<4, 8,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              4,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  2, Scheduler, PipelineVersion>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,   128,    32,     32,   8,   32,   32,    4,    1,  S<4, 16, 1>,  S<2, 0, 1>,  S<1, 0, 2>,                  1,              8,              8,      false,  S<4, 4,  1>,  S<2, 0, 1>,  S<1, 0, 2>,                1,              8,              8,      false,           1,           1,   S<1, 8, 1, 8>,                  2, Scheduler, PipelineVersion>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    80,     32,   8,   16,   16,    4,    5,  S<4, 16, 1>,  S<2, 0, 1>,  S<2, 0, 1>,                  1,              4,              4,      false,  S<4, 16,  1>,  S<2, 0, 1>,  S<2, 0, 1>,               1,              5,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  2, Scheduler, PipelineVersion>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffleV3< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,   112,     32,   8,   16,   16,    4,    7,  S<4, 16, 1>,  S<2, 0, 1>,  S<2, 0, 1>,                  1,              4,              4,      false,  S<4, 16,  1>,  S<2, 0, 1>,  S<2, 0, 1>,               1,              7,              4,      false,           1,           1,   S<1, 8, 1, 8>,                  2, Scheduler, PipelineVersion>
+    //clang-format on
+    >;
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp
index a08d73546d..a493719637 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
@@ -45,31 +45,49 @@ template <ck::index_t NDimSpatial,
           typename BLayout,
           typename ELayout,
           ConvolutionBackwardWeightSpecialization ConvSpec>
+using device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_generic_instances = std::tuple<
+    // clang-format off
+  //#########################################|     Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
+  //#########################################|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
+  //#########################################| Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
+  //#########################################|        |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
+  // generic instance
+  DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  4,   32,   32,    2,    2,  S<1, 4, 16, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              1,              4,      true,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              1,              4,      true,           1,           1,   S<1, 16, 1, 4>,               1>
+    // clang-format on
+    >;
+
+template <ck::index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          ConvolutionBackwardWeightSpecialization ConvSpec,
+          index_t MaxTransposeTransferSrcScalarPerVector = 1,
+          index_t MaxTransposeTransferDstScalarPerVector = 1>
 using device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances = std::tuple<
     // clang-format off
-        //#########################################|     Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
-        //#########################################|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
-        //#########################################| Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
-        //#########################################|        |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
+        //#########################################|     Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| Compute| Compute|    MaxTranspose|    MaxTranspose|
+        //#########################################|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|   TypeA|   TypeB|     TransferSrc|     TransferDst|
+        //#########################################| Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|        |        | ScalarPerVector| ScalarPerVector|
+        //#########################################|        |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |        |        |                |                |
         // generic instance
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  4,   32,   32,    2,    2,  S<1, 4, 16, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              1,              4,      true,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              1,              4,      true,           1,           1,   S<1, 16, 1, 4>,               1>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  4,   32,   32,    2,    2,  S<1, 4, 16, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              1,              4,      true,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              1,              4,      true,           1,           1,   S<1, 16, 1, 4>,               1, F32, F32, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,
         // instances for small conv.K and conv.C
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  4,   32,   32,    2,    1,  S<1, 4, 32, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              1,              1,      true,           1,           1,   S<1, 32, 1, 4>,               1>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  4,   32,   32,    1,    2,  S<1, 4, 8,  2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              1,              2,      true,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  4,   32,   32,    2,    1,  S<1, 4, 32, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              1,              1,      true,           1,           1,   S<1, 32, 1, 4>,               1, F32, F32, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  4,   32,   32,    1,    2,  S<1, 4, 8,  2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              1,              2,      true,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4, F32, F32, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,
 
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   256,   128,     4,  4,   32,   32,    4,    2,  S<1, 4, 64, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 8>,               4>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   256,     4,  4,   32,   32,    2,    4,  S<1, 4, 32, 2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 64, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 8>,               4>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,   128,     4,  4,   32,   32,    4,    2,  S<1, 4, 32, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   128,     4,  4,   32,   32,    2,    2,  S<1, 4, 32, 2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    64,     4,  4,   32,   32,    2,    2,  S<1, 4, 32, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    64,   128,     4,  4,   32,   32,    2,    2,  S<1, 4, 16, 2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  4,   32,   32,    2,    2,  S<1, 4, 16, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,    64,     4,  4,   32,   32,    2,    1,  S<1, 4, 32, 2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,    64,   128,     4,  4,   32,   32,    1,    2,  S<1, 4, 16, 4>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              1,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  4,   32,   32,    2,    1,  S<1, 4, 32, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    32,   128,     4,  4,   32,   32,    1,    2,  S<1, 4, 8,  4>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              1,      true,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,     4,  4,   32,   32,    2,    1,  S<1, 4, 16, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 16, 1, 4>,               4>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  4,   32,   32,    1,    2,  S<1, 4, 8,  2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   256,   128,     4,  4,   32,   32,    4,    2,  S<1, 4, 64, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 8>,               4, F32, F32, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   256,     4,  4,   32,   32,    2,    4,  S<1, 4, 32, 2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 64, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 8>,               4, F32, F32, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,   128,     4,  4,   32,   32,    4,    2,  S<1, 4, 32, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4, F32, F32, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   128,     4,  4,   32,   32,    2,    2,  S<1, 4, 32, 2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4, F32, F32, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    64,     4,  4,   32,   32,    2,    2,  S<1, 4, 32, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4, F32, F32, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    64,   128,     4,  4,   32,   32,    2,    2,  S<1, 4, 16, 2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4, F32, F32, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  4,   32,   32,    2,    2,  S<1, 4, 16, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4, F32, F32, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,    64,     4,  4,   32,   32,    2,    1,  S<1, 4, 32, 2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4, F32, F32, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,    64,   128,     4,  4,   32,   32,    1,    2,  S<1, 4, 16, 4>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              1,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4, F32, F32, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  4,   32,   32,    2,    1,  S<1, 4, 32, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4, F32, F32, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    32,   128,     4,  4,   32,   32,    1,    2,  S<1, 4, 8,  4>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              1,      true,  S<1, 4, 32, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4, F32, F32, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,     4,  4,   32,   32,    2,    1,  S<1, 4, 16, 1>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              2,      true,           1,           1,   S<1, 16, 1, 4>,               4, F32, F32, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F32,     F32,     F32,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  4,   32,   32,    1,    2,  S<1, 4, 8,  2>,   S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              4,              2,      true,  S<1, 4, 16, 1>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              4,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4, F32, F32, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>
     // clang-format on
     >;
 
@@ -78,33 +96,51 @@ template <ck::index_t NDimSpatial,
           typename BLayout,
           typename ELayout,
           ConvolutionBackwardWeightSpecialization ConvSpec>
+using device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_generic_instances = std::tuple<
+    // clang-format off
+  //#########################################|     Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
+  //#########################################|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
+  //#########################################| Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
+  //#########################################|        |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
+  // generic instance
+  DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              4,      true,           1,           1,   S<1, 16, 1, 4>,               2>
+    // clang-format on
+    >;
+
+template <ck::index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          ConvolutionBackwardWeightSpecialization ConvSpec,
+          index_t MaxTransposeTransferSrcScalarPerVector = 1,
+          index_t MaxTransposeTransferDstScalarPerVector = 1>
 using device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances = std::tuple<
     // clang-format off
-        //#########################################|     Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
-        //#########################################|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
-        //#########################################| Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
-        //#########################################|        |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
+        //#########################################|     Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| Compute| Compute|    MaxTranspose|    MaxTranspose|
+        //#########################################|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|   TypeA|   TypeB|     TransferSrc|     TransferDst|
+        //#########################################| Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|        |        | ScalarPerVector| ScalarPerVector|
+        //#########################################|        |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |        |        |                |                |
         // generic instance
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              4,      true,           1,           1,   S<1, 16, 1, 4>,               2>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              4,      true,           1,           1,   S<1, 16, 1, 4>,               2, F16, F16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
         // instance for small conv.K
         // for fp16 conv.K and conv.C must be divisible by 2
         // since half_t atomic_add require scalar_per_x_vector % 2 == 0
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              1,      true,           1,           1,   S<1, 32, 1, 4>,               2>,  
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              1,      true,           1,           1,   S<1, 32, 1, 4>,               2, F16, F16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,  
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8, F16, F16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,
 
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 8>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 16, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8>
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 8>,               8, F16, F16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 8>,               8, F16, F16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8, F16, F16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8, F16, F16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8, F16, F16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8, F16, F16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8, F16, F16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8, F16, F16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8, F16, F16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8, F16, F16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8, F16, F16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 16, 1, 4>,               8, F16, F16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8, F16, F16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>
     // clang-format on
     >;
 
@@ -113,31 +149,49 @@ template <ck::index_t NDimSpatial,
           typename BLayout,
           typename ELayout,
           ConvolutionBackwardWeightSpecialization ConvSpec>
+using device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_generic_instances = std::tuple<
+    // clang-format off
+  //#########################################|     Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|               ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
+  //#########################################|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                     Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
+  //#########################################| Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|             Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
+  //#########################################|        |         |          |          |       |        |        |        |            |            |            |                           |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
+  // generic instance
+  DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              1,              4,      true,   S<1, 4, 8,  2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              1,              4,      true,           1,           1,   S<1, 16, 1, 4>,               1>
+    // clang-format on
+    >;
+
+template <ck::index_t NDimSpatial,
+          typename ALayout,
+          typename BLayout,
+          typename ELayout,
+          ConvolutionBackwardWeightSpecialization ConvSpec,
+          index_t MaxTransposeTransferSrcScalarPerVector = 1,
+          index_t MaxTransposeTransferDstScalarPerVector = 1>
 using device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances = std::tuple<
     // clang-format off
-        //#########################################|     Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|               ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
-        //#########################################|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                     Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
-        //#########################################| Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|             Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
-        //#########################################|        |         |          |          |       |        |        |        |            |            |            |                           |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
+        //#########################################|     Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| Compute| Compute|    MaxTranspose|    MaxTranspose|
+        //#########################################|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|   TypeA|   TypeB|     TransferSrc|     TransferDst|
+        //#########################################| Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|        |        | ScalarPerVector| ScalarPerVector|
+        //#########################################|        |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |        |        |                |                |
         // generic instance
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              1,              4,      true,   S<1, 4, 8,  2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              1,              4,      true,           1,           1,   S<1, 16, 1, 4>,               1>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              1,              4,      true,   S<1, 4, 8,  2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              1,              4,      true,           1,           1,   S<1, 16, 1, 4>,               1, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
         // instance for small conv.K
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 4,  8>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              1,              1,      true,           1,           1,   S<1, 32, 1, 4>,               1>, 
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              1,              2,      true,   S<1, 4, 8,  2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 4,  8>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              1,              1,      true,           1,           1,   S<1, 32, 1, 4>,               1, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>, 
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              1,              2,      true,   S<1, 4, 8,  2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,
 
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 16, 4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 8>,               4>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 16, 4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 32, 2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 8>,               4>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 16, 2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 16, 2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 16, 4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 8,  4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 16, 2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 8,  2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 8,  8>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 8,  8>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              1,      true,   S<1, 4, 16, 4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 4,  8>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  8>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              1,      true,   S<1, 4, 16, 2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,    64,    64,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 8,  2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 4,  4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 16, 1, 4>,               4>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 8,  2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4>
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 16, 4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 8>,               4, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 16, 4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 32, 2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 8>,               4, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 16, 2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 16, 2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 16, 4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 8,  4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 16, 2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 8,  2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 8,  8>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 8,  8>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              1,      true,   S<1, 4, 16, 4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               4, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 4,  8>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               4, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  8>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              1,      true,   S<1, 4, 16, 2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               4, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,    64,    64,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 8,  2>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              4,      true,   S<1, 4, 4,  4>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 16, 1, 4>,               4, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,     F32,    BF16,     F32, PassThrough, PassThrough, PassThrough,                   ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>, S<0, 3, 1, 2>,   S<0, 2, 1, 3>,              2,              8,              2,      true,   S<1, 4, 8,  2>,   S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               4, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>
     // clang-format on
     >;
 
@@ -145,34 +199,36 @@ template <ck::index_t NDimSpatial,
           typename ALayout,
           typename BLayout,
           typename ELayout,
-          ConvolutionBackwardWeightSpecialization ConvSpec>
+          ConvolutionBackwardWeightSpecialization ConvSpec,
+          index_t MaxTransposeTransferSrcScalarPerVector = 1,
+          index_t MaxTransposeTransferDstScalarPerVector = 1>
 using device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances = std::tuple<
     // clang-format off
-        //#########################################|     Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
-        //#########################################|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
-        //#########################################| Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
-        //#########################################|        |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
+        //#########################################|     Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| Compute| Compute|    MaxTranspose|    MaxTranspose|
+        //#########################################|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|   TypeA|   TypeB|     TransferSrc|     TransferDst|
+        //#########################################| Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|        |        | ScalarPerVector| ScalarPerVector|
+        //#########################################|        |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |        |        |                |                |
         // generic instance
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              4,      true,           1,           1,   S<1, 16, 1, 4>,               2>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              4,      true,           1,           1,   S<1, 16, 1, 4>,               2, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
         // instance for small conv.K
         // for bf16 conv.K and conv.C must be divisible by 2
         // since half_t atomic_add require scalar_per_x_vector % 2 == 0
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              1,      true,           1,           1,   S<1, 32, 1, 4>,               2>,  
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              1,      true,           1,           1,   S<1, 32, 1, 4>,               2, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,  
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,
 
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 8>,               8>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 8>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 16, 1, 4>,               8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8>
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 8>,               8, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 8>,               8, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 16, 1, 4>,               8, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,   BF16,    BF16,    BF16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8, BF16, BF16, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>
     // clang-format on
     >;
 
@@ -180,35 +236,37 @@ template <ck::index_t NDimSpatial,
           typename ALayout,
           typename BLayout,
           typename ELayout,
-          ConvolutionBackwardWeightSpecialization ConvSpec>
+          ConvolutionBackwardWeightSpecialization ConvSpec,
+          index_t MaxTransposeTransferSrcScalarPerVector = 1,
+          index_t MaxTransposeTransferDstScalarPerVector = 1>
 using device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_comp_bf8_f8_instances = std::tuple<
 // clang-format off
-        //#########################################|     Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| Compute| Compute|
-        //#########################################|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|   TypeA|   TypeB|
-        //#########################################| Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|        |        |
-        //#########################################|        |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |        |        |
+        //#########################################|     Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|              ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer| Compute| Compute|    MaxTranspose|    MaxTranspose|
+        //#########################################|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                    Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|   TypeA|   TypeB|     TransferSrc|     TransferDst|
+        //#########################################| Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|            Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|        |        | ScalarPerVector| ScalarPerVector|
+        //#########################################|        |         |          |          |       |        |        |        |            |            |            |                          |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |        |        |                |                |
 #if defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
         // generic instance
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              4,      true,           1,           1,   S<1, 16, 1, 4>,               2,       BF8,     F8>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              4,      true,           1,           1,   S<1, 16, 1, 4>,               2,       BF8,     F8, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
         // instance for small conv.K
         // for fp16 conv.K and conv.C must be divisible by 2
         // since half_t atomic_add require scalar_per_x_vector % 2 == 0
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              1,      true,           1,           1,   S<1, 32, 1, 4>,               2,       BF8,     F8>,  
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8,       BF8,     F8>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              2,              1,      true,           1,           1,   S<1, 32, 1, 4>,               2,       BF8,     F8, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,  
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              2,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8,       BF8,     F8, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,
 
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 8>,               8,       BF8,     F8>,
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 8>,               8,       BF8,     F8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8,       BF8,     F8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 16, 1, 4>,               8,       BF8,     F8>,   
-        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8,       BF8,     F8>
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 8>,               8,       BF8,     F8, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 32, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 8>,               8,       BF8,     F8, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    64,   128,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    64,     4,  8,   32,   32,    2,    2,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8,       BF8,     F8, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,   128,    64,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   256,    64,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 8,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,   128,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              1,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    32,   128,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  8>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              1,      true,  S<1, 4, 16, 2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8,       BF8,     F8, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    64,    32,     4,  8,   32,   32,    2,    1,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              4,      true,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              2,      true,           1,           1,   S<1, 16, 1, 4>,               8,       BF8,     F8, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>,   
+        DeviceGroupedConvBwdWeight_Xdl_CShuffle< NDimSpatial,  ALayout,   BLayout,   ELayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,    64,    32,    64,     4,  8,   32,   32,    1,    2,  S<1, 4, 4,  4>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,              2,              8,              2,      true,  S<1, 4, 8,  2>,  S<0, 3, 1, 2>,  S<0, 2, 1, 3>,             2,              8,              4,      true,           1,           1,   S<1, 16, 1, 4>,               8,       BF8,     F8, MaxTransposeTransferSrcScalarPerVector, MaxTransposeTransferDstScalarPerVector>
 #endif
     // clang-format on
     >;
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp
index dc4ee534b9..f9b3b43795 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp
@@ -56,11 +56,14 @@ template <index_t NDimSpatial,
           typename ELayout,
           ConvolutionForwardSpecialization ConvSpec>
 using device_grouped_conv_fwd_xdl_bf16_comp_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
         //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    64,  16,  16,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
+#else // defined(CK_USE_AMD_MFMA_GFX950)
         // Compute friendly
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
@@ -79,7 +82,7 @@ using device_grouped_conv_fwd_xdl_bf16_comp_instances = std::tuple<
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,    64,    64,   8,   8,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,    64,   128,    64,   8,   8,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,    64,    64,    64,   8,   8,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>
-
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
     // clang-format on
     >;
 
@@ -90,11 +93,14 @@ template <index_t NDimSpatial,
           typename ELayout,
           ConvolutionForwardSpecialization ConvSpec>
 using device_grouped_conv_fwd_xdl_f16_comp_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
         //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    64,  16,  16,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
+#else // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    32,   8,   8,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
@@ -109,6 +115,7 @@ using device_grouped_conv_fwd_xdl_f16_comp_instances = std::tuple<
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   256,    32,   8,   8,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   128,    32,   8,   8,  32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
     // clang-format on
     >;
 
@@ -138,11 +145,14 @@ template <index_t NDimSpatial,
           typename ELayout,
           ConvolutionForwardSpecialization ConvSpec>
 using device_grouped_conv_fwd_xdl_int8_comp_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
         //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,DsLayout,ELayout,int8_t,int8_t,int32_t,   int8_t,    DsLayout,int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,             256,   128,   128,   128,  32,  32,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,              16,        1,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,         1,            1,           1,              S<1, 64, 1, 4>,              16,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
+#else // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    32,   8,   8,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
@@ -153,6 +163,7 @@ using device_grouped_conv_fwd_xdl_int8_comp_instances = std::tuple<
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   256,    32,   8,   8,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   256,   128,    32,   8,   8,  32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle_V3<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   int8_t,   int8_t,     int32_t,      int8_t,    DsLayout,   int8_t, PassThrough, PassThrough, PassThrough,       ConvSpec, GemmMNKPadding,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
     // clang-format on
     >;
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp
index 242ad2f730..9114d5c1fb 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_merged_groups_instance.hpp
@@ -40,15 +40,21 @@ template <index_t NDimSpatial,
           typename ELayout,
           ConvolutionForwardSpecialization ConvSpec>
 using device_grouped_conv_fwd_xdl_merged_groups_bf16_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| ACompute| BCompute| BlockGemm| NumGroups|
         //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|     Type|     Type|  Pipeline|   ToMerge|
         //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|         |         | Scheduler|          |
         //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |         |         |          |          |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     32,   8, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, BF16, BF16, LoopScheduler::Default, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     32,   8, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, BF16, BF16, LoopScheduler::Default, 16>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     32,   8, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, BF16, BF16, LoopScheduler::Default, 32>
+#else // defined(CK_USE_AMD_MFMA_GFX950)
         // Instances with NumGroupsPerBatch > 1
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     16,   4, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, BF16, BF16, LoopScheduler::Default, 8>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     16,   4, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, BF16, BF16, LoopScheduler::Default, 16>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,  BF16,  BF16,     F32,     BF16,    DsLayout,  BF16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     16,   4, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, BF16, BF16, LoopScheduler::Default, 32>
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
     // clang-format on
     >;
 
@@ -59,15 +65,22 @@ template <index_t NDimSpatial,
           typename ELayout,
           ConvolutionForwardSpecialization ConvSpec>
 using device_grouped_conv_fwd_xdl_merged_groups_f16_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //########################################|     NumDim|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|          Ds| EData|           A|           B|         CDE|    ConvForward|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
         //########################################|    Spatial| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|    DataType|  Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //########################################|           |       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //########################################|           |       |       |            |       |      |      |        |         |            |      |            |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        // Instances with NumGroupsPerBatch > 1
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     32,   8, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, F16, F16, LoopScheduler::Default, 8>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     32,   8, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, F16, F16, LoopScheduler::Default, 16>,
+        DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     32,   8, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, F16, F16, LoopScheduler::Default, 32>
+#else // defined(CK_USE_AMD_MFMA_GFX950)
         // Instances with NumGroupsPerBatch > 1
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     16,   4, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, F16, F16, LoopScheduler::Default, 8>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     16,   4, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, F16, F16, LoopScheduler::Default, 16>,
         DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<NDimSpatial,ALayout,BLayout,    DsLayout,ELayout,   F16,   F16,     F32,      F16,    DsLayout,   F16, PassThrough, PassThrough, PassThrough,                  ConvSpec, GemmMNKPadding,  1,  64,    64,    16,     16,   4, 4,  16,   16,    4,    1,  S< 4, 16,  1>, S<0, 2, 1>,     S<0, 2, 1>,                   1,              4,              4,      1,  S< 4, 16,  1>,   S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              4,      1,           1,           1,   S<1, 16, 1, 4>,                  1, F16, F16, LoopScheduler::Default, 32>
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
     // clang-format on
     >;
 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
index 33027322e5..f1993eb149 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -311,6 +311,11 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 {
                     add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instances(
                         op_ptrs);
+
+                    add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_default_pipev1_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_pad0_pipev1_instances(
+                        op_ptrs);
                 }
 #endif
 #ifdef CK_ENABLE_FP16
@@ -320,6 +325,11 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 {
                     add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instances(
                         op_ptrs);
+
+                    add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_default_pipev1_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_pad0_pipev1_instances(
+                        op_ptrs);
                 }
 #endif
 #ifdef CK_ENABLE_BF16
@@ -343,6 +353,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 {
                     add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instances(
                         op_ptrs);
+
+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev5_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev5_instances(
+                        op_ptrs);
                 }
 #endif
 #ifdef CK_ENABLE_FP16
@@ -352,12 +371,26 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 {
                     add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
                         op_ptrs);
+
+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev5_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev5_instances(
+                        op_ptrs);
+
                     add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instances(
                         op_ptrs);
                     add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instances(
                         op_ptrs);
                     add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instances(
                         op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_irregular_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_irregular_instances(
+                        op_ptrs);
                 }
 #endif
 #ifdef CK_ENABLE_BF16
@@ -377,12 +410,26 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 {
                     add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(
                         op_ptrs);
+
+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev5_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev5_instances(
+                        op_ptrs);
+
                     add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instances(
                         op_ptrs);
                     add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_instances(
                         op_ptrs);
                     add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_instances(
                         op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_irregular_instances(
+                        op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_irregular_instances(
+                        op_ptrs);
                 }
 #endif
             }
@@ -400,6 +447,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                         op_ptrs);
                     add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev5_instances(
                         op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f16_instances(
+                        op_ptrs);
                 }
 #endif
 #ifdef CK_ENABLE_BF16
@@ -415,6 +464,17 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                         op_ptrs);
                     add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev5_instances(
                         op_ptrs);
+                    add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_bf16_instances(
+                        op_ptrs);
+                }
+#endif
+#ifdef CK_ENABLE_FP32
+                if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
+                             is_same_v<OutDataType, float> && is_same_v<ComputeTypeA, float> &&
+                             is_same_v<ComputeTypeB, float>)
+                {
+                    add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f32_instances(
+                        op_ptrs);
                 }
 #endif
             }
@@ -463,6 +523,15 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 {
                     add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(
                         op_ptrs);
+
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev5_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev2_instances(
+                        op_ptrs);
                 }
 #endif
 #ifdef CK_ENABLE_FP16
@@ -472,12 +541,26 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 {
                     add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances(
                         op_ptrs);
+
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev5_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev5_instances(
+                        op_ptrs);
+
                     add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instances(
                         op_ptrs);
                     add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instances(
                         op_ptrs);
                     add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instances(
                         op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_irregular_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_irregular_instances(
+                        op_ptrs);
                 }
 #endif
 #ifdef CK_ENABLE_BF16
@@ -497,12 +580,26 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                 {
                     add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instances(
                         op_ptrs);
+
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev5_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev2_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev5_instances(
+                        op_ptrs);
+
                     add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instances(
                         op_ptrs);
                     add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instances(
                         op_ptrs);
                     add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instances(
                         op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_irregular_instances(
+                        op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_irregular_instances(
+                        op_ptrs);
                 }
 #endif
 #if defined CK_ENABLE_FP16 && defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
@@ -529,6 +626,8 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                         op_ptrs);
                     add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev5_instances(
                         op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instances(
+                        op_ptrs);
                 }
 #endif
 #ifdef CK_ENABLE_BF16
@@ -544,6 +643,17 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                         op_ptrs);
                     add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instances(
                         op_ptrs);
+                    add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instances(
+                        op_ptrs);
+                }
+#endif
+#ifdef CK_ENABLE_FP32
+                if constexpr(is_same_v<InDataType, float> && is_same_v<WeiDataType, float> &&
+                             is_same_v<OutDataType, float> && is_same_v<ComputeTypeA, float> &&
+                             is_same_v<ComputeTypeB, float>)
+                {
+                    add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instances(
+                        op_ptrs);
                 }
 #endif
             }
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc
index 630eb81357..31a536c7bb 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_weight_xdl.inc
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -74,6 +74,30 @@ void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instances(
                                                            PassThrough,
                                                            PassThrough,
                                                            PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_default_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           GNHWC,
+                                                           GKYXC,
+                                                           GNHWK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_pad0_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           GNHWC,
+                                                           GKYXC,
+                                                           GNHWK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
 #endif
 #ifdef CK_ENABLE_FP32
 void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instances(
@@ -87,6 +111,30 @@ void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instances(
                                                            PassThrough,
                                                            PassThrough,
                                                            PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_default_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           GNHWC,
+                                                           GKYXC,
+                                                           GNHWK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_pad0_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           GNHWC,
+                                                           GKYXC,
+                                                           GNHWK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
 #endif
 #ifdef CK_ENABLE_BF16
 void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(
@@ -101,6 +149,18 @@ void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instances(
                                                            PassThrough,
                                                            PassThrough>>>& instances);
 
+void add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NGCHW,
+                                                           GKYXC,
+                                                           NGKHW,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
 void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                            NHWGC,
@@ -112,6 +172,53 @@ void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_in
                                                            PassThrough,
                                                            PassThrough,
                                                            PassThrough>>>& instances);
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
 
 void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
@@ -149,6 +256,30 @@ void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_p
                                                            PassThrough,
                                                            PassThrough>>>& instances);
 
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_irregular_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_irregular_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
 void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev1_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                            NGCHW,
@@ -198,6 +329,66 @@ void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_instances(
                                                            PassThrough,
                                                            PassThrough>>>& instances);
 
+void add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NGCHW,
+                                                           GKYXC,
+                                                           NGKHW,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
 void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev1_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                            NHWGC,
@@ -234,6 +425,30 @@ void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pi
                                                            PassThrough,
                                                            PassThrough>>>& instances);
 
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_irregular_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_irregular_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
 void add_device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
                                                            NGCHW,
@@ -282,6 +497,66 @@ void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instances(
                                                            PassThrough,
                                                            PassThrough,
                                                            PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NGCHW,
+                                                           GKYXC,
+                                                           NGKHW,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
 #endif
 // conv3d backward weight
 #ifdef CK_ENABLE_BF16
@@ -336,6 +611,66 @@ void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance
                                                            PassThrough,
                                                            PassThrough>>>& instances);
 
+void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NGCDHW,
+                                                           GKZYXC,
+                                                           NGKDHW,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
 void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                            NDHWGC,
@@ -384,6 +719,30 @@ void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf1
                                                            PassThrough,
                                                            PassThrough>>>& instances);
 
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_irregular_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_irregular_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
 void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                            NGCDHW,
@@ -433,6 +792,66 @@ void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_instances
                                                            PassThrough,
                                                            PassThrough>>>& instances);
 
+void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NGCDHW,
+                                                           GKZYXC,
+                                                           NGKDHW,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
 void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                            NDHWGC,
@@ -469,6 +888,30 @@ void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16
                                                            PassThrough,
                                                            PassThrough>>>& instances);
 
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_irregular_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_irregular_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
 void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                            NGCDHW,
@@ -506,6 +949,18 @@ void add_device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16
                                                            PassThrough>>>& instances);
 #endif
 #ifdef CK_ENABLE_FP32
+void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NGCDHW,
+                                                           GKZYXC,
+                                                           NGKDHW,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
 void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
                                                            NDHWGC,
@@ -517,6 +972,54 @@ void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instances
                                                            PassThrough,
                                                            PassThrough,
                                                            PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
+
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances);
 #endif
 #if defined CK_ENABLE_FP16 && defined CK_ENABLE_FP8 && defined CK_ENABLE_BF8
 void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_comp_bf8_f8_instances(
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
index 8090b2449a..01415c2ddd 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp
@@ -304,7 +304,23 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
                     op_ptrs);
             }
 #endif
-
+#ifdef CK_ENABLE_BF16
+            if constexpr(is_same_v<InDataType, ck::bhalf_t> &&
+                         is_same_v<WeiDataType, ck::bhalf_t> &&
+                         is_same_v<OutDataType, ck::bhalf_t> &&
+                         is_same_v<AComputeType, ck::bhalf_t> &&
+                         is_same_v<BComputeType, ck::bhalf_t>)
+            {
+                add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_bf16_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_instances(op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_comp_instances(op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_intra_instances(
+                    op_ptrs);
+                add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_inter_instances(
+                    op_ptrs);
+            }
+#endif
 #ifdef CK_ENABLE_INT8
             if constexpr(is_same_v<InDataType, int8_t> && is_same_v<WeiDataType, int8_t> &&
                          is_same_v<OutDataType, int8_t> && is_same_v<AComputeType, int8_t> &&
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_comp_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_comp_xdl.inc
index e47a876e1b..9a83e36b99 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_comp_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_comp_xdl.inc
@@ -90,6 +90,22 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_comp_instances(
                                                                 PassThrough>>>& instances);
 #endif
 
+#ifdef CK_ENABLE_BF16
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_comp_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
 #ifdef CK_ENABLE_FP32
 void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_comp_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_inter_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_inter_xdl.inc
index f74622ad42..662fadadcf 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_inter_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_inter_xdl.inc
@@ -90,6 +90,22 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_inter_instances
                                                                 PassThrough>>>& instances);
 #endif
 
+#ifdef CK_ENABLE_BF16
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_inter_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
 #ifdef CK_ENABLE_FP32
 void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_inter_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_intra_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_intra_xdl.inc
index 81737b6142..f283fe8550 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_intra_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_mem_intra_xdl.inc
@@ -90,6 +90,22 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_intra_instances
                                                                 PassThrough>>>& instances);
 #endif
 
+#ifdef CK_ENABLE_BF16
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_intra_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
 #ifdef CK_ENABLE_FP32
 void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_intra_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc
index 4cb2aae092..c977c89c94 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl.inc
@@ -204,6 +204,22 @@ void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_instances(
                                                                 PassThrough>>>& instances);
 #endif
 
+#ifdef CK_ENABLE_BF16
+void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
+#endif
+
 #ifdef CK_ENABLE_FP32
 void add_device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_instances(
     std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc
index 1bd2697b97..a81e1e07ba 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_xdl_merged_groups.inc
@@ -23,6 +23,20 @@ void add_device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_bf16_inst
                                                                 PassThrough,
                                                                 PassThrough,
                                                                 PassThrough>>>& instances);
+
+void add_device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<2,
+                                                                NGCHW,
+                                                                GKYXC,
+                                                                Empty_Tuple,
+                                                                NGKHW,
+                                                                BF16,
+                                                                BF16,
+                                                                Empty_Tuple,
+                                                                BF16,
+                                                                PassThrough,
+                                                                PassThrough,
+                                                                PassThrough>>>& instances);
 #endif
 
 #ifdef CK_ENABLE_FP16
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm_fixed_nk.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm_fixed_nk.hpp
index a90fe14603..850183c513 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm_fixed_nk.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_gemm_fixed_nk.hpp
@@ -126,6 +126,35 @@ void add_device_grouped_gemm_xdl_fixed_nk_bf16_i8_bf16_mk_nk_mn_instances(
                                                          PassThrough>>>& instances);
 #endif
 
+// bf16_inputA bf16_inputB
+#if defined(CK_ENABLE_BF16)
+void add_device_grouped_gemm_xdl_fixed_nk_bf16_bf16_bf16_mk_kn_mn_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemmFixedNK<Row,
+                                                         Row,
+                                                         Empty_Tuple,
+                                                         Row,
+                                                         BF16,
+                                                         BF16,
+                                                         Empty_Tuple,
+                                                         BF16,
+                                                         PassThrough,
+                                                         PassThrough,
+                                                         PassThrough>>>& instances);
+
+void add_device_grouped_gemm_xdl_fixed_nk_bf16_bf16_bf16_mk_nk_mn_instances(
+    std::vector<std::unique_ptr<DeviceGroupedGemmFixedNK<Row,
+                                                         Col,
+                                                         Empty_Tuple,
+                                                         Row,
+                                                         BF16,
+                                                         BF16,
+                                                         Empty_Tuple,
+                                                         BF16,
+                                                         PassThrough,
+                                                         PassThrough,
+                                                         PassThrough>>>& instances);
+#endif // CK_ENABLE_BF16
+
 template <typename ALayout,
           typename BLayout,
           typename ELayout,
@@ -227,6 +256,24 @@ struct DeviceOperationInstanceFactory<
         }
 #endif
 
+// bf16_inputA bf16_inputB
+#if defined(CK_ENABLE_BF16)
+        if constexpr(is_same_v<ADataType, bhalf_t> && is_same_v<BDataType, bhalf_t> &&
+                     is_same_v<EDataType, bhalf_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
+                         is_same_v<ELayout, Row>)
+            {
+                add_device_grouped_gemm_xdl_fixed_nk_bf16_bf16_bf16_mk_kn_mn_instances(op_ptrs);
+            }
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
+                         is_same_v<ELayout, Row>)
+            {
+                add_device_grouped_gemm_xdl_fixed_nk_bf16_bf16_bf16_mk_nk_mn_instances(op_ptrs);
+            }
+        }
+#endif // CK_ENABLE_BF16
+
         return op_ptrs;
     }
 };
diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
old mode 100644
new mode 100755
index dd023e6b51..5b88d5f25c
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -39,6 +39,13 @@ function(add_instance_library INSTANCE_NAME)
 
     set(INST_TARGETS ${SUPPORTED_GPU_TARGETS})
 
+    # Do not build DPP instances if DPP_KERNELS macro is not set
+    foreach(source IN LISTS ARGN)
+        if(NOT DEFINED DPP_KERNELS AND source MATCHES "_dpp")
+            message("removing dpp instance ${source} ")
+            list(REMOVE_ITEM ARGN "${source}")
+        endif()
+    endforeach()
     # Do not build DL instances if DL_KERNELS macro is not set
     foreach(source IN LISTS ARGN)
         if(NOT DEFINED DL_KERNELS AND source MATCHES "_dl")
@@ -62,7 +69,7 @@ function(add_instance_library INSTANCE_NAME)
     endforeach()
     # Do not build mha instances if gfx94 or gfx90a targets are not on the target list
     foreach(source IN LISTS ARGN)
-	    if(NOT INST_TARGETS MATCHES "gfx94" AND NOT INST_TARGETS MATCHES "gfx90a" AND source MATCHES "mha")
+	    if(NOT INST_TARGETS MATCHES "gfx94" AND NOT INST_TARGETS MATCHES "gfx90a" AND NOT INST_TARGETS MATCHES "gfx95" AND source MATCHES "mha")
          message("removing mha instance ${source} ")
          list(REMOVE_ITEM ARGN "${source}")
     endif()
@@ -70,25 +77,25 @@ function(add_instance_library INSTANCE_NAME)
     # Do not build gemm_universal_f8 or gemm_multiply_multiply_f8 for any targets except gfx94
     if(NOT CK_USE_FP8_ON_UNSUPPORTED_ARCH)
     foreach(source IN LISTS ARGN)
-    if(NOT INST_TARGETS MATCHES "gfx94" AND source MATCHES "gemm_multiply_multiply_xdl_f8")
+    if(NOT INST_TARGETS MATCHES "gfx94" AND NOT INST_TARGETS MATCHES "gfx95" AND source MATCHES "gemm_multiply_multiply_xdl_f8")
          message("removing gemm_multiply_multiply_f8 instance ${source} ")
          list(REMOVE_ITEM ARGN "${source}")
     endif()
     endforeach()
     foreach(source IN LISTS ARGN)
-    if(NOT INST_TARGETS MATCHES "gfx94" AND source MATCHES "gemm_xdl_universal" AND source MATCHES "_f8_")
+    if(NOT INST_TARGETS MATCHES "gfx94" AND NOT INST_TARGETS MATCHES "gfx95" AND source MATCHES "gemm_xdl_universal" AND source MATCHES "_f8_")
          message("removing gemm_universal_f8 instance ${source} ")
          list(REMOVE_ITEM ARGN "${source}")
     endif()
     endforeach()
     foreach(source IN LISTS ARGN)
-    if(NOT INST_TARGETS MATCHES "gfx94" AND source MATCHES "batched_gemm_xdl_universal" AND source MATCHES "_f8_")
+    if(NOT INST_TARGETS MATCHES "gfx94" AND NOT INST_TARGETS MATCHES "gfx95" AND source MATCHES "batched_gemm_xdl_universal" AND source MATCHES "_f8_")
          message("removing batched_gemm_universal_f8 instance ${source} ")
          list(REMOVE_ITEM ARGN "${source}")
     endif()
     endforeach()
     foreach(source IN LISTS ARGN)
-    if(NOT INST_TARGETS MATCHES "gfx94" AND source MATCHES "gemm_xdl_universal_streamk" AND source MATCHES "_f8_")
+    if(NOT INST_TARGETS MATCHES "gfx94" AND NOT INST_TARGETS MATCHES "gfx95" AND source MATCHES "gemm_xdl_universal_streamk" AND source MATCHES "_f8_")
          message("removing gemm_universal_streamk_f8 instance ${source} ")
          list(REMOVE_ITEM ARGN "${source}")
     endif()
@@ -102,7 +109,7 @@ function(add_instance_library INSTANCE_NAME)
             if(source MATCHES "_xdl")
                 list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
             elseif(source MATCHES "_wmma")
-                list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030)
+                list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack+ gfx908:xnack- gfx90a:xnack+ gfx90a:xnack- gfx908 gfx90a gfx940 gfx941 gfx942 gfx1030 gfx950)
             elseif(source MATCHES "mha")
                 list(REMOVE_ITEM INST_TARGETS gfx900 gfx906 gfx906:xnack- gfx908:xnack- gfx908:xnack+ gfx908 gfx1030 gfx1100 gfx1101 gfx1102 gfx1103 gfx1200 gfx1201 gfx10.3-generic gfx11-generic gfx12-generic)
             endif()
@@ -183,6 +190,10 @@ FOREACH(subdir_path ${dir_list})
             message("bf8 instance found!")
             set(add_inst 1)
         endif()
+        if(("${cmake_instance}" MATCHES "_bf16" OR "${cmake_instance}" MATCHES "_b16") AND DTYPES MATCHES "bf16")
+            message("bf16 instance found!")
+            set(add_inst 1)
+        endif()
         if(("${cmake_instance}" MATCHES "_fp16" OR "${cmake_instance}" MATCHES "_f16") AND DTYPES MATCHES "fp16")
             message("fp16 instance found!")
             set(add_inst 1)
@@ -195,10 +206,6 @@ FOREACH(subdir_path ${dir_list})
             message("fp64 instance found!")
             set(add_inst 1)
         endif()
-        if("${cmake_instance}" MATCHES "_bf16" AND DTYPES MATCHES "bf16")
-            message("bf16 instance found!")
-            set(add_inst 1)
-        endif()
         if(("${cmake_instance}" MATCHES "_int8" OR "${cmake_instance}" MATCHES "_i8") AND DTYPES MATCHES "int8")
             message("int8 instance found!")
             set(add_inst 1)
@@ -278,9 +285,14 @@ ENDFOREACH()
 
 
 if(CK_DEVICE_OTHER_INSTANCES)
-        add_library(device_other_operations STATIC ${CK_DEVICE_OTHER_INSTANCES})
+        add_library(device_other_operations ${CK_DEVICE_OTHER_INSTANCES})
         add_library(composablekernels::device_other_operations ALIAS device_other_operations)
         set_target_properties(device_other_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
+        set_target_properties(device_other_operations
+            PROPERTIES
+            VERSION ${CMAKE_PROJECT_VERSION}
+            SOVERSION ${CMAKE_PROJECT_VERSION_MAJOR}
+        )
         target_include_directories(device_other_operations PUBLIC
             $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck>
             $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/utility>
@@ -309,10 +321,15 @@ if(CK_DEVICE_OTHER_INSTANCES)
         )
 endif()
 if(CK_DEVICE_GEMM_INSTANCES)
-        add_library(device_gemm_operations STATIC ${CK_DEVICE_GEMM_INSTANCES})
+        add_library(device_gemm_operations ${CK_DEVICE_GEMM_INSTANCES})
         add_library(composablekernels::device_gemm_operations ALIAS device_gemm_operations)
         target_compile_features(device_gemm_operations PUBLIC)
         set_target_properties(device_gemm_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
+        set_target_properties(device_gemm_operations
+            PROPERTIES
+            VERSION ${CMAKE_PROJECT_VERSION}
+            SOVERSION ${CMAKE_PROJECT_VERSION_MAJOR}
+        )
         target_include_directories(device_gemm_operations PUBLIC
             $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu>
         )
@@ -325,10 +342,15 @@ if(CK_DEVICE_GEMM_INSTANCES)
         )
 endif()
 if(CK_DEVICE_CONV_INSTANCES)
-        add_library(device_conv_operations STATIC ${CK_DEVICE_CONV_INSTANCES})
+        add_library(device_conv_operations ${CK_DEVICE_CONV_INSTANCES})
         add_library(composablekernels::device_conv_operations ALIAS device_conv_operations)
         target_compile_features(device_conv_operations PUBLIC)
         set_target_properties(device_conv_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
+        set_target_properties(device_conv_operations
+            PROPERTIES
+            VERSION ${CMAKE_PROJECT_VERSION}
+            SOVERSION ${CMAKE_PROJECT_VERSION_MAJOR}
+        )
         target_include_directories(device_conv_operations PUBLIC
             $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu>
             $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange>
@@ -346,8 +368,13 @@ if(CK_DEVICE_CONV_INSTANCES)
 endif()
 if(CK_DEVICE_MHA_INSTANCES)
         set(gpu_list ${INST_TARGETS})
-	if(gpu_list MATCHES "gfx94" OR gpu_list MATCHES "gfx90a")
-            add_library(device_mha_operations STATIC ${CK_DEVICE_MHA_INSTANCES})
+        if(gpu_list MATCHES "gfx94" OR gpu_list MATCHES "gfx90a" OR gpu_list MATCHES "gfx95")
+            add_library(device_mha_operations ${CK_DEVICE_MHA_INSTANCES})
+            set_target_properties(device_mha_operations
+                PROPERTIES
+                VERSION ${CMAKE_PROJECT_VERSION}
+                SOVERSION ${CMAKE_PROJECT_VERSION_MAJOR}
+            )
             add_library(composablekernels::device_mha_operations ALIAS device_mha_operations)
             target_compile_features(device_mha_operations PUBLIC)
             set_target_properties(device_mha_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
@@ -362,10 +389,15 @@ if(CK_DEVICE_MHA_INSTANCES)
         endif()
 endif()
 if(CK_DEVICE_CONTRACTION_INSTANCES)
-        add_library(device_contraction_operations STATIC ${CK_DEVICE_CONTRACTION_INSTANCES})
+        add_library(device_contraction_operations ${CK_DEVICE_CONTRACTION_INSTANCES})
         add_library(composablekernels::device_contraction_operations ALIAS device_contraction_operations)
         target_compile_features(device_contraction_operations PUBLIC)
         set_target_properties(device_contraction_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
+        set_target_properties(device_contraction_operations
+            PROPERTIES
+            VERSION ${CMAKE_PROJECT_VERSION}
+            SOVERSION ${CMAKE_PROJECT_VERSION_MAJOR}
+        )
         target_include_directories(device_contraction_operations PUBLIC
             $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu>
             $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/contraction>
@@ -379,10 +411,15 @@ if(CK_DEVICE_CONTRACTION_INSTANCES)
         )
 endif()
 if(CK_DEVICE_REDUCTION_INSTANCES)
-        add_library(device_reduction_operations STATIC ${CK_DEVICE_REDUCTION_INSTANCES})
+        add_library(device_reduction_operations ${CK_DEVICE_REDUCTION_INSTANCES})
         add_library(composablekernels::device_reduction_operations ALIAS device_reduction_operations)
         target_compile_features(device_reduction_operations PUBLIC)
         set_target_properties(device_reduction_operations PROPERTIES POSITION_INDEPENDENT_CODE ON)
+        set_target_properties(device_reduction_operations
+            PROPERTIES
+            VERSION ${CMAKE_PROJECT_VERSION}
+            SOVERSION ${CMAKE_PROJECT_VERSION_MAJOR}
+        )
         target_include_directories(device_reduction_operations PUBLIC
             $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck/library/tensor_operation_instance/gpu/reduce>
         )
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp
index 5a9483b309..dad67b396f 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instance.cpp
@@ -27,12 +27,15 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
 // Compilation parameters for a[k, m] * b[k, n] = c[m, n]
 using device_batched_gemm_xdl_f16_f16_f16_gkm_gkn_gmn_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //##########|        AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumGemmK|          LoopScheduler|                    Pipeline|
         //##########|         Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar| Prefetch|                       |                            |
         //##########|             |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector| Stage   |                       |                            |
         //##########|             |      |      |        |        |        |        |            |            |            |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |         |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Col,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Col,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Col,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Col,      Row,    Row, PassThrough, PassThrough, PassThrough,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp
index 0fa0719237..62df9a81b5 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instance.cpp
@@ -27,12 +27,15 @@ using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
 // Compilation parameters for a[k, m] * b[n, k] = c[m, n]
 using device_batched_gemm_xdl_f16_f16_f16_gkm_gnk_gmn_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //##########|        AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumGemmK|          LoopScheduler|                    Pipeline|
         //##########|         Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar| Prefetch|                       |                            |
         //##########|             |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector| Stage   |                       |                            |
         //##########|             |      |      |        |        |        |        |            |            |            |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |         |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp
index dc7de8c689..23e176f6e8 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instance.cpp
@@ -26,23 +26,29 @@ using S = ck::Sequence<Is...>;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
 using device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_generic_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#################| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumGemmK|          LoopScheduler|                    Pipeline|
         //#################|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar| Prefetch|                       |                            |
         //#################|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector| Stage   |                       |                            |
         //#################|      |      |      |        |        |        |        |            |            |            |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |         |                       |                            |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,    64,    16,    16,     4, 16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,    64,    16,    16,     4,  8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>
     // clang-format on
     >;
 
 // Compilation parameters for a[m, k] * b[k, n] = c[m, n]
 using device_batched_gemm_xdl_f16_f16_f16_gmk_gkn_gmn_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#################| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumGemmK|          LoopScheduler|                    Pipeline|
         //#################|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar| Prefetch|                       |                            |
         //#################|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector| Stage   |                       |                            |
         //#################|      |      |      |        |        |        |        |            |            |            |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |         |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp
index cccad7ca19..0216b002ac 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm/device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instance.cpp
@@ -26,23 +26,29 @@ using S = ck::Sequence<Is...>;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
 using device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_generic_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#################| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumGemmK|          LoopScheduler|                    Pipeline|
         //#################|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar| Prefetch|                       |                            |
         //#################|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector| Stage   |                       |                            |
         //#################|      |      |      |        |        |        |        |            |            |            |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |         |                       |                            |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    64,    32,    64,     4, 16,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,    64,    32,    64,     4,  8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>
     // clang-format on
     >;
 
 // Compilation parameters for a[m, k] * b[n, k] = c[m, n]
 using device_batched_gemm_xdl_f16_f16_f16_gmk_gnk_gmn_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#################| AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CThreadTransfer| CThreadTransfer| NumGemmK|          LoopScheduler|                    Pipeline|
         //#################|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar| Prefetch|                       |                            |
         //#################|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector| Stage   |                       |                            |
         //#################|      |      |      |        |        |        |        |            |            |            |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |         |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceBatchedGemmXdl<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,        1, LoopScheduler::Default,        PipelineVersion::v1>,
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
index cf23d01bf2..6af3c2654e 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm/device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
@@ -33,11 +33,13 @@ static constexpr auto GemmPadded  = ck::tensor_operation::device::GemmSpecializa
 template <bool Masking>
 using device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances =
     std::tuple<
-        // clang-format off
+// clang-format off
         //#######################################| ALayout| B0Layout| B1Layout| CLayout| AData| B0Data| B1Data| CData| AccData| CShuffle|           A|          B0|        Acc0|          B1|           C|           GEMM| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds|  B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|    MaskOut|
         //#######################################|        |         |         |        |  Type|   Type|   Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|      Upper|
         //#######################################|        |         |         |        |      |       |       |      |        |         |   Operation|   Operation|   Operation|   Operation|   Operation|               |    Stage|      |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|   Triangle|
         //#######################################|        |         |         |        |      |       |       |      |        |         |            |            |            |            |            |               |         |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |           |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle<     Row,      Col,      Row,     Row,   F16,    F16,    F16,   F16,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault,        1,   256,    256,   128,    32,    64,    32,   8,   8,    2,   32,   32,     2,     4,     2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8,   Masking>,
         DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle<     Row,      Col,      Row,     Row,   F16,    F16,    F16,   F16,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault,        1,   256,    256,   128,    32,   128,    32,   8,   8,    2,   32,   32,     2,     4,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8,   Masking>,
         DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle<     Row,      Col,      Row,     Row,   F16,    F16,    F16,   F16,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault,        1,   256,    128,   256,    32,    64,    32,   8,   8,    2,   32,   32,     1,     8,     2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8,   Masking>,
@@ -59,11 +61,13 @@ using device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_
 template <bool Masking>
 using device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_irregular_k_instances =
     std::tuple<
-        // clang-format off
+// clang-format off
         //#######################################| ALayout| B0Layout| B1Layout| CLayout| AData| B0Data| B1Data| CData| AccData| CShuffle|           A|          B0|        Acc0|          B1|           C|           GEMM| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds|  B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|    MaskOut|
         //#######################################|        |         |         |        |  Type|   Type|   Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|      Upper|
         //#######################################|        |         |         |        |      |       |       |      |        |         |   Operation|   Operation|   Operation|   Operation|   Operation|               |    Stage|      |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|   Triangle|
         //#######################################|        |         |         |        |      |       |       |      |        |         |            |            |            |            |            |               |         |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |           |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle<     Row,      Col,      Row,     Row,   F16,    F16,    F16,   F16,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,     GemmPadded,        1,   256,    256,   128,    40,    64,    32,   4,   4,    2,   32,   32,     2,     4,     2,     S<2,128, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,     false,      S<2,128, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               4,               4,      false,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8,   Masking>,
         DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle<     Row,      Col,      Row,     Row,   F16,    F16,    F16,   F16,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,     GemmPadded,        1,   256,    256,   128,    40,   128,    32,   4,   4,    2,   32,   32,     2,     4,     4,     S<2,128, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,     false,      S<2,128, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               4,               4,      false,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8,   Masking>,
         DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle<     Row,      Col,      Row,     Row,   F16,    F16,    F16,   F16,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,     GemmPadded,        1,   256,    128,   256,    40,    64,    32,   4,   4,    2,   32,   32,     1,     8,     2,     S<2,128, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              4,              4,     false,      S<2,128, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               4,               4,      false,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8,   Masking>,
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
index 498bf58fb3..8382f069d7 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
@@ -40,11 +40,13 @@ template <index_t NumDimG,
           MaskingSpecialization MaskingSpec>
 using device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances =
     std::tuple<
-        // clang-format off
+// clang-format off
         // #############################################|  NumDimG| NumDimM| NumDimN| NumDimK| NumDimO|  AData|  B0Data|  B1Data|  CData|     Acc0BiasData| Acc1BiasData| AccData| CShuffle|           A|          B0|        Acc0|          B1|           C|           GEMM|   ATensorSpec|  B0TensorSpec|  B1TensorSpec|   CTensorSpec| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds|  B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| MaskingSpec|  D0s Bias|
         // #############################################|         |        |        |        |        |   Type|    Type|    Type|   Type|             Type|         Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization|              |              |              |              | Prefetch|  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            | SrcScalar|
         // #############################################|         |        |        |        |        |       |        |        |       |                 |             |        |         |   Operation|   Operation|   Operation|   Operation|   Operation|               |              |              |              |              |    Stage|      |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            | PerVector|
         // #############################################|         |        |        |        |        |       |        |        |       |                 |             |        |         |            |            |            |            |            |               |              |              |              |              |         |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |            |          |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<BF16>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,    64,    32,   128,    32,   8,   8,    2,   32,   32,     1,     2,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec,        1>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<BF16>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,    64,    32,   8,   8,    2,   32,   32,     2,     4,     2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<BF16>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,   128,    32,   8,   8,    2,   32,   32,     2,     4,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
index 744bd6456d..b6c14d69db 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
@@ -40,11 +40,13 @@ template <index_t NumDimG,
           MaskingSpecialization MaskingSpec>
 using device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instances =
     std::tuple<
-        // clang-format off
+// clang-format off
         // #############################################|  NumDimG| NumDimM| NumDimN| NumDimK| NumDimO| AData| B0Data| B1Data| CData|    Acc0BiasData| Acc1BiasData| AccData| CShuffle|           A|          B0|        Acc0|          B1|           C|           GEMM|   ATensorSpec|  B0TensorSpec|  B1TensorSpec|   CTensorSpec| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds|  B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| MaskingSpec|  D0s Bias|
         // #############################################|         |        |        |        |        |  Type|   Type|   Type|  Type|            Type|         Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization|              |              |              |              | Prefetch|  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            | SrcScalar|
         // #############################################|         |        |        |        |        |      |       |       |      |                |             |        |         |   Operation|   Operation|   Operation|   Operation|   Operation|               |              |              |              |              |    Stage|      |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            | PerVector|
         // #############################################|         |        |        |        |        |      |       |       |      |                |             |        |         |            |            |            |            |            |               |              |              |              |              |         |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |            |          |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<F16>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,    64,    32,   128,    32,   8,   8,    2,   32,   32,     1,     2,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec,       1>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<F16>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,    64,    32,   8,   8,    2,   32,   32,     2,     4,     2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<F16>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,    ScaleAdd, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,   128,    32,   8,   8,    2,   32,   32,     2,     4,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
index b342612d1c..2ce5124706 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instance.cpp
@@ -40,11 +40,13 @@ template <index_t NumDimG,
           MaskingSpecialization MaskingSpec>
 using device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_bf16_bf16_bf16_bf16_gmk_gnk_gno_gmo_instances =
     std::tuple<
-        // clang-format off
+// clang-format off
         // #############################################|  NumDimG| NumDimM| NumDimN| NumDimK| NumDimO|  AData|  B0Data|  B1Data|  CData| Acc0BiasData| Acc1BiasData| AccData| CShuffle|           A|          B0|        Acc0|          B1|           C|           GEMM|   ATensorSpec|  B0TensorSpec|  B1TensorSpec|   CTensorSpec| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds|  B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| MaskingSpec|
         // #############################################|         |        |        |        |        |   Type|    Type|    Type|   Type|         Type|         Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Elementwise| Elementwise| Specialization|              |              |              |              | Prefetch|  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            |
         // #############################################|         |        |        |        |        |       |        |        |       |             |             |        |         |   Operation|   Operation|   Operation|   Operation|   Operation|               |              |              |              |              |    Stage|      |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            |
         // #############################################|         |        |        |        |        |       |        |        |       |             |             |        |         |            |            |            |            |            |               |              |              |              |              |         |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |            |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,    64,    32,   128,    32,   8,   8,    2,   32,   32,     1,     2,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,    64,    32,   8,   8,    2,   32,   32,     2,     4,     2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   BF16,    BF16,    BF16,   BF16,  ck::Tuple<>,  ck::Tuple<>,     F32,     BF16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,   128,    32,   8,   8,    2,   32,   32,     2,     4,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
diff --git a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
index 3fd0c07370..4e8adceb1c 100644
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance.cpp
@@ -58,8 +58,10 @@ using device_batched_gemm_softmax_gemm_permute_xdl_cshuffle_f16_f16_f16_f16_gmk_
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   128,    32,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    32,   128,    32,   8,   8,    2,   16,   16,     1,    16,     8,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           8,               S<1, 16, 1,16>,               8, MaskingSpec>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    32,    64,    32,   8,   8,    2,   16,   16,     1,    16,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           4,               S<1, 32, 1, 8>,               8, MaskingSpec>,
+#if !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    64,   128,    32,   8,   8,    2,   16,   16,     1,    16,     8,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           8,               S<1, 16, 1,16>,               8, MaskingSpec>,
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    64,    64,    32,   8,   8,    2,   16,   16,     1,    16,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           4,               S<1, 32, 1, 8>,               8, MaskingSpec>,
+#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         // Padded fallback kernel
         DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO,   F16,    F16,    F16,   F16,  ck::Tuple<>,  ck::Tuple<>,     F32,      F16, PassThrough, PassThrough,       Scale, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   128,    64,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>
         // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp
index da96c79a6e..c0c74ff7fb 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd/device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instance.cpp
@@ -40,11 +40,14 @@ static constexpr auto ConvFwdOddC =
 
 // arbitrary conv
 using device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //##########################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //##########################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //##########################################################################|       |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //##########################################################################|       |        |        |        |            |            |            |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault,   256,   128,   128,     4,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvFwdDefault,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
@@ -63,11 +66,14 @@ using device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances = std::tuple<
 
 // 1x1, pad 0
 using device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_1x1_p0_f16_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //##########################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //##########################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //##########################################################################|       |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //##########################################################################|       |        |        |        |            |            |            |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0,   256,   128,   128,     4,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,   ConvFwd1x1P0,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
@@ -86,11 +92,14 @@ using device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_1x1_p0_f16_instances = std:
 
 // 1x1, stride 1, pad 0
 using device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //##########################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //##########################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //##########################################################################|       |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //##########################################################################|       |        |        |        |            |            |            |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough, ConvFwd1x1S1P0,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
@@ -108,11 +117,14 @@ using device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances = s
     >;
 
 using device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_odd_c_f16_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //##########################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //##########################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //##########################################################################|       |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //##########################################################################|       |        |        |        |            |            |            |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,    ConvFwdOddC,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,    ConvFwdOddC,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,    ConvFwdOddC,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,    ConvFwdOddC,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp
index e34ea06ff4..715dffb7ff 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp
@@ -39,11 +39,14 @@ static constexpr auto ConvFwdOddC =
 
 // arbitrary conv
 using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //##########################################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|           Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //##########################################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|  GlobalMemory| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //##########################################################################################|       |        |        |        |   Operation|   Operation|   Operation| DataOperation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //##########################################################################################|       |        |        |        |            |            |            |              |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet, ConvFwdDefault,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet, ConvFwdDefault,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet, ConvFwdDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet, ConvFwdDefault,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
@@ -62,11 +65,14 @@ using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instances = s
 
 // 1x1, pad 0
 using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_1x1_p0_f16_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //##########################################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|           Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //##########################################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|  GlobalMemory| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //##########################################################################################|       |        |        |        |   Operation|   Operation|   Operation| DataOperation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //##########################################################################################|       |        |        |        |            |            |            |              |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,   ConvFwd1x1P0,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,   ConvFwd1x1P0,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,   ConvFwd1x1P0,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,   ConvFwd1x1P0,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
@@ -85,11 +91,14 @@ using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_1x1_p0_f16_instan
 
 // 1x1, stride 1, pad 0
 using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //##########################################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|           Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //##########################################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|  GlobalMemory| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //##########################################################################################|       |        |        |        |   Operation|   Operation|   Operation| DataOperation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //##########################################################################################|       |        |        |        |            |            |            |              |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet, ConvFwd1x1S1P0,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet, ConvFwd1x1S1P0,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet, ConvFwd1x1S1P0,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet, ConvFwd1x1S1P0,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
@@ -108,11 +117,14 @@ using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_1x1_s1_p0_f16_ins
 
 // Odd C
 using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_odd_c_f16_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //##########################################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|           Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //##########################################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|  GlobalMemory| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //##########################################################################################|       |        |        |        |   Operation|   Operation|   Operation| DataOperation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //##########################################################################################|       |        |        |        |            |            |            |              |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,    ConvFwdOddC,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,    ConvFwdOddC,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,    ConvFwdOddC,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,    ConvFwdOddC,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
diff --git a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp
index 3254fcfc26..5676d77986 100644
--- a/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/conv2d_fwd_bias_relu_add/device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instance.cpp
@@ -37,11 +37,14 @@ static constexpr auto ConvFwdOddC =
 
 // arbitrary conv
 using device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //##############################################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //##############################################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //##############################################################################################|       |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //##############################################################################################|       |        |        |        |            |            |            |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd, ConvFwdDefault,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd, ConvFwdDefault,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd, ConvFwdDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd, ConvFwdDefault,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
@@ -60,11 +63,14 @@ using device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_f16_instances
 
 // 1x1, pad 0
 using device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_1x1_p0_f16_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //##############################################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //##############################################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //##############################################################################################|       |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //##############################################################################################|       |        |        |        |            |            |            |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd,   ConvFwd1x1P0,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd,   ConvFwd1x1P0,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd,   ConvFwd1x1P0,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd,   ConvFwd1x1P0,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
@@ -83,11 +89,14 @@ using device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_1x1_p0_f16_in
 
 // 1x1, stride 1, pad 0
 using device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //##############################################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //##############################################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //##############################################################################################|       |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //##############################################################################################|       |        |        |        |            |            |            |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd, ConvFwd1x1S1P0,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd, ConvFwd1x1S1P0,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd, ConvFwd1x1S1P0,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd, ConvFwd1x1S1P0,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
@@ -106,11 +115,14 @@ using device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_1x1_s1_p0_f16
 
 // Odd C
 using device_conv2d_fwd_xdl_c_shuffle_bias_relu_add_nhwc_kyxc_nhwk_odd_c_f16_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //##############################################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|    ConvForward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //##############################################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise| Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //##############################################################################################|       |        |        |        |   Operation|   Operation|   Operation|               |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //##############################################################################################|       |        |        |        |            |            |            |               |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd,    ConvFwdOddC,   256,   128,   128,     4, 16,   32,   32,    2,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd,    ConvFwdOddC,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd,    ConvFwdOddC,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<4,  8, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 32, 1, 1, 8>,               8>,
         DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,  AddReluAdd,    ConvFwdOddC,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              1,      true,     S<4,  4, 8>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              1,      true,           1,           1,             S<1, 1, 16, 1, 1, 8>,               8>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp
index 2e884dfc8a..0d143b95ee 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -29,12 +29,15 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
 
 // Compilation parameters for a[m, k] * b[n, k] = c[m, n]
 using device_gemm_xdl_c_shuffle_2_stage_f16_f16_f16_mk_nk_mn_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //#####################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                     Version|
         //#####################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //#####################|        |        |        |      |      |      |        |         |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        2,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        2,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        2,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        2,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp
index 60d4ccf525..ce58469697 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instance.cpp
@@ -55,6 +55,11 @@ using device_gemm_xdl_c_shuffle_bf16_bf16_bf16_mk_nk_mn_instances = std::tuple<
         DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,    64,    64,    32,    32,   8,   8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,    64,    32,    64,    32,   8,   8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV2<   Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,        2,   256,   256,   256,    32,   8,   8,   32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         0,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        ,
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,  BF16,  BF16,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,    64,    64,   128,  32,  32,   16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              4,  LoopScheduler::Default,        PipelineVersion::v1>
+#endif
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
         ,
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
index 44b6848233..3ebd0c5351 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
@@ -31,12 +31,15 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
 
 // Compilation parameters for a[k, m] * b[k, n] = c[m, n]
 using device_gemm_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //#####################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //#####################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //#####################|        |        |        |      |      |      |        |         |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemm_Xdl_CShuffle<     Col,      Row,    Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffle<     Col,      Row,    Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   256,   128,    32,   2,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle<     Col,      Row,    Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle<     Col,      Row,    Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   256,    32,   2,   2,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
index 23176269c2..bd0f7346f7 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
@@ -31,12 +31,15 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
 
 // Compilation parameters for a[k, m] * b[n, k] = c[m, n]
 using device_gemm_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //#####################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //#####################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //#####################|        |        |        |      |      |      |        |         |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemm_Xdl_CShuffle<     Col,      Col,    Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffle<     Col,      Col,    Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   256,   128,    32,   2,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle<     Col,      Col,    Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle<     Col,      Col,    Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   256,    32,   2,   8,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
index 8c9a96f6b7..a6ac333ea3 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -35,10 +35,13 @@ static constexpr auto MNPadding = ck::tensor_operation::device::GemmSpecializati
 static constexpr auto MNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
 
 using device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_generic_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //#####################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //#####################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemm_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,     MNKPadding,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 8>,              1,  LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,     MNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 8>,              1,  LoopScheduler::Default,        PipelineVersion::v1>
     // clang-format on
     >;
@@ -46,11 +49,13 @@ using device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_generic_instances = std::tu
 // Compilation parameters for a[m, k] * b[k, n] = c[m, n]
 template <ck::tensor_operation::device::GemmSpecialization GemmSpec>
 using device_gemm_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //#####################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //#####################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //#####################|        |        |        |      |      |      |        |         |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // pipeline v1, 1 wave
         DeviceGemm_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   256,   128,    32,   8,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle<     Row,      Row,    Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
index b591dacff5..852b053527 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -34,11 +34,16 @@ static constexpr auto MNKPadding  = ck::tensor_operation::device::GemmSpecializa
 
 // Compilation parameters for a[m, k] * b[n, k] = c[m, n]
 using device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_generic_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //#####################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //#####################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //#####################|        |        |        |      |      |      |        |         |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        //DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
+        //DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,    64,    64,   128,  32,  32,   16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              4,  LoopScheduler::Default,        PipelineVersion::v1>
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,     MNKPadding,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 8>,              1,  LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,     MNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 8>,              1,  LoopScheduler::Default,        PipelineVersion::v1>
     // clang-format on
     >;
@@ -46,11 +51,13 @@ using device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_generic_instances = std::tu
 template <ck::tensor_operation::device::GemmSpecialization GemmSpec>
 // Compilation parameters for a[m, k] * b[n, k] = c[m, n]
 using device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#####################| ALayout| BLayout| CLayout| AData| BData| CData| AccData| CShuffle|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //#####################|        |        |        |  Type|  Type|  Type|    Type| DataType| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //#####################|        |        |        |      |      |      |        |         |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //#####################|        |        |        |      |      |      |        |         |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // pipeline v1, 1 wave
         DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row,   F16,   F16,   F16,     F32,      F16, PassThrough, PassThrough, PassThrough,       GemmSpec,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8,  LoopScheduler::Default,        PipelineVersion::v1>,
@@ -110,7 +117,6 @@ void add_device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(
 {
     add_device_operation_instances(
         instances, device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_generic_instances{});
-
     add_device_operation_instances(
         instances, device_gemm_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances<GemmDefault>{});
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
index 0ade7a61ce..41f6ec2bf7 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instance.cpp
@@ -47,7 +47,12 @@ using device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances =
         DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row, int8_t, int8_t, int8_t, int32_t,  int32_t, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   128,    32,   128,    64,  16,  16,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 4>,             16>,
         DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row, int8_t, int8_t, int8_t, int32_t,  int32_t, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,    64,    64,    32,    64,  16,  16,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 2>,             16>,
         DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row, int8_t, int8_t, int8_t, int32_t,  int32_t, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,    64,    32,    64,    64,  16,  16,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 2>,             16>
-        // clang-format on
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        ,
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row, int8_t, int8_t, int8_t, int32_t,  int32_t, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,   128,   128,   128,  32,  32,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 64, 1, 4>,              16,  LoopScheduler::Default,        PipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle<     Row,     Col,     Row, int8_t, int8_t, int8_t, int32_t,  int32_t, PassThrough, PassThrough, PassThrough,    GemmDefault,        1,   256,    64,    64,   256,  64,  64,   16,   16,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,             16,             16,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,             16,             16,         1,           1,           1,               S<1, 32, 1, 8>,               4,  LoopScheduler::Default,        PipelineVersion::v1>
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
+       // clang-format on
         >;
 
 void add_device_gemm_xdl_c_shuffle_i8_i8_i8_mk_nk_mn_instances(
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_interwave_pipeline_v1_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_interwave_pipeline_v1_instance.cpp
index a64424e8ac..74cf837500 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_interwave_pipeline_v1_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_interwave_pipeline_v1_instance.cpp
@@ -9,8 +9,7 @@ namespace device {
 namespace instance {
 
 // Compilation parameters for a[k, m] * b[k, n] = c[m, n]
-using Instances =
-    std::tuple<
+using Instances = std::tuple<
 // clang-format off
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
@@ -18,6 +17,8 @@ using Instances =
         //##########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|            |                       |         |
         //##########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|            |                       |         |
         //##########|      |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |            |                       |         |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
@@ -27,8 +28,8 @@ using Instances =
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   128,    64,     4,  8,   32,   32,    2,    1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,    64,   128,     4,  8,   32,   32,    1,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>
 #endif
-        // clang-format on
-        >;
+    // clang-format on
+    >;
 
 void add_device_gemm_xdl_f16_f16_f16_km_kn_mn_interwave_pipeline_v1_instances(
     OwnerList<InstanceNT>& instances)
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_interwave_pipeline_v1_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_interwave_pipeline_v1_instance.cpp
index 0a0406baec..f2b28f3b40 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_interwave_pipeline_v1_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_kn_mn_irregular_interwave_pipeline_v1_instance.cpp
@@ -17,6 +17,8 @@ using Instances = std::tuple<
         //###########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|            |                         |                           |
         //###########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|            |                         |                           |
         //###########|      |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |            |                         |                           |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmXdl<   F16,   F16,   F16,     F32,     Col,     Row,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   64,     16,    16,     4,  8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,               7,               1,           1, LoopScheduler::Interwave,        PipelineVersion::v1>
 #endif
     // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_interwave_pipeline_v1_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_interwave_pipeline_v1_instance.cpp
index 3671bea7a3..da5fefe5da 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_interwave_pipeline_v1_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_interwave_pipeline_v1_instance.cpp
@@ -9,8 +9,7 @@ namespace device {
 namespace instance {
 
 // Compilation parameters for a[k, m] * b[n, k] = c[m, n]
-using Instances =
-    std::tuple<
+using Instances = std::tuple<
 // clang-format off
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
@@ -18,6 +17,8 @@ using Instances =
         //##########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|            |                       |                             |
         //##########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|            |                       |                             |
         //##########|      |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |            |                       |                             |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
@@ -27,8 +28,8 @@ using Instances =
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   128,    64,     4,  8,   32,   32,    2,    1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,    64,   128,     4,  8,   32,   32,    1,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>
 #endif
-        // clang-format on
-        >;
+    // clang-format on
+    >;
 
 void add_device_gemm_xdl_f16_f16_f16_km_nk_mn_interwave_pipeline_v1_instances(
     OwnerList<InstanceNN>& instances)
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_interwave_pipeline_v1_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_interwave_pipeline_v1_instance.cpp
index 95fc8ecb46..b6c03b3367 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_interwave_pipeline_v1_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/km_nk_mn_irregular_interwave_pipeline_v1_instance.cpp
@@ -17,6 +17,8 @@ using Instances = std::tuple<
         //###########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|            |                       |                             |
         //###########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|            |                       |                             |
         //###########|      |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |            |                       |                             |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Col,      Col,    Row, PassThrough, PassThrough, PassThrough,   GemmMNPadding,   64,    16,    16,     4,  8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>
 #endif
     // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_interwave_pipeline_v1_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_interwave_pipeline_v1_instance.cpp
index fa53a3bf0f..bf271cc3c3 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_interwave_pipeline_v1_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_interwave_pipeline_v1_instance.cpp
@@ -9,8 +9,7 @@ namespace device {
 namespace instance {
 
 // Compilation parameters for a[m, k] * b[k, n] = c[m, n]
-using Instances =
-    std::tuple<
+using Instances = std::tuple<
 // clang-format off
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
@@ -18,6 +17,8 @@ using Instances =
         //##########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|            |                       |                             |
         //##########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|            |                       |                             |
         //##########|      |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |            |                       |                             |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
@@ -36,8 +37,8 @@ using Instances =
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   128,    16,    32,     4,  8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
         DeviceGemmXdl<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,   GemmDefault,    64,    16,    16,     4,  8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>
 #endif
-        // clang-format on
-        >;
+    // clang-format on
+    >;
 
 void add_device_gemm_xdl_f16_f16_f16_mk_kn_mn_interwave_pipeline_v1_instances(
     OwnerList<InstanceTT>& instances)
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_interwave_pipeline_v1_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_interwave_pipeline_v1_instance.cpp
index c9d1913aec..0df59933c2 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_interwave_pipeline_v1_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_kn_mn_irregular_interwave_pipeline_v1_instance.cpp
@@ -17,6 +17,8 @@ using Instances = std::tuple<
         //###########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|            |                       |                             |
         //###########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|            |                       |                             |
         //###########|      |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |            |                       |                             |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmXdl<   F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,    64,    16,    16,     4,  8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>
 #endif
     // clang-format on
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_interwave_pipeline_v1_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_interwave_pipeline_v1_instance.cpp
index 0410eabb70..d9260d85ab 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_interwave_pipeline_v1_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_interwave_pipeline_v1_instance.cpp
@@ -17,6 +17,8 @@ using Instances = std::tuple<
         //###########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|            |                       |                             |
         //###########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|            |                       |                             |
         //###########|      |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |            |                       |                             |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmXdl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
         DeviceGemmXdl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
         DeviceGemmXdl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,   GemmDefault,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_interwave_pipeline_v1_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_interwave_pipeline_v1_instance.cpp
index a41919aab7..8b98133ada 100644
--- a/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_interwave_pipeline_v1_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm/device_gemm_xdl_f16_f16_f16/mk_nk_mn_irregular_interwave_pipeline_v1_instance.cpp
@@ -17,6 +17,8 @@ using Instances = std::tuple<
         //###########|  Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| SrcDstVectorDim|       DstScalar|            |                       |                             |
         //###########|      |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |                |       PerVector|            |                       |                             |
         //###########|      |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |                |                |            |                       |                             |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmXdl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,    64,    64,    32,     4,  8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
         DeviceGemmXdl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   128,   144,     8,  8,   16,   16,    2,    9,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<8,  8, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              2,              2,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>,
         DeviceGemmXdl<   F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough, GemmMNPadding,   256,   128,   144,     4,  8,   16,   16,    2,    9,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 4>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              2,              2,      true,               7,               1,           1,  LoopScheduler::Interwave,        PipelineVersion::v1>
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
index 125fbc21a2..9ea79b1467 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instance.cpp
@@ -38,23 +38,29 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // input: a[k, m], b[k, n], d0[m, n], d1[m, n]
 using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_generic_instance =
     std::tuple<
-        // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>
         // clang-format on
         >;
 using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_instances =
     std::tuple<
-        // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   256,   128,    32,   2,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   128,   256,    32,   2,   2,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -117,12 +123,15 @@ using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn
 // irregular tile size
 using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_kn_mn_mn_mn_irregular_tile_instances =
     std::tuple<
-        // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
index cc33692d73..f6a959a447 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instance.cpp
@@ -38,23 +38,29 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // input: a[k, m], b[n, k], d0[m, n], d1[m, n]
 using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_generic_instance =
     std::tuple<
-        // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>
         // clang-format on
         >;
 using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_instances =
     std::tuple<
-        // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   256,   128,    32,   2,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   128,   256,    32,   2,   8,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -117,12 +123,15 @@ using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn
 // irregular tile size
 using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_km_nk_mn_mn_mn_irregular_tile_instances =
     std::tuple<
-        // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
index 704787a080..8a68a8bb76 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instance.cpp
@@ -38,23 +38,29 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // input: a[m, k], b[k, n], d0[m, n], d1[m, n]
 using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_generic_instance =
     std::tuple<
-        // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,   128,   128,   128,    64,  16,   2,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   2,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>
         // clang-format on
         >;
 using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_instances =
     std::tuple<
-        // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   128,   256,    32,   8,   2,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -117,12 +123,15 @@ using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn
 // irregular tile size
 using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_kn_mn_mn_mn_irregular_tile_instances =
     std::tuple<
-        // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
index d64c9ec5e1..42f7a10b9f 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_add_fastgelu/device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instance.cpp
@@ -38,23 +38,29 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // input: a[m, k], b[n, k], d0[m, n], d1[m ,n]
 using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_generic_instance =
     std::tuple<
-        // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,   128,   128,   128,    64,  16,  16,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>
         // clang-format on
         >;
 using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_instances =
     std::tuple<
-        // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu,    GemmDefault,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -108,12 +114,15 @@ using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn
 // irregular tile size
 using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_f16_mk_nk_mn_mn_mn_irregular_tile_instances =
     std::tuple<
-        // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,   F16, PassThrough, PassThrough, AddAddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
index e68bd8e7e4..6d1c16776a 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instance.cpp
@@ -23,22 +23,28 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // input: a[k, m], b[k, n], d0[m, n]
 using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_generic_instance =
     std::tuple<
-        // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
         // clang-format on
         >;
 using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //##############################|      A|      B|        Ds|      E| AData| BData| AccData| CShuffle|    DsData| EData|           A|           B|         CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|    Layout| Layout|  Type|  Type|    Type| DataType|      Type|  Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |          |       |      |      |        |         |          |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |          |       |      |      |        |         |          |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   256,   128,    32,   2,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   128,   256,    32,   2,   2,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -101,12 +107,15 @@ using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_instanc
 // irregular tile size
 using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_kn_mn_mn_irregular_tile_instances =
     std::tuple<
-        // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
index 5aaa2e8fe5..8b746781ef 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instance.cpp
@@ -23,22 +23,28 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // input: a[k, m], b[n, k], d0[m, n], d1[m, n]
 using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_generic_instance =
     std::tuple<
-        // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
         // clang-format on
         >;
 using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //##############################|      A|      B|        Ds|      E| AData| BData| AccData| CShuffle|    DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|    Layout| Layout|  Type|  Type|    Type| DataType|      Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |          |       |      |      |        |         |          |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |          |       |      |      |        |         |          |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   256,   128,    32,   2,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   128,   256,    32,   2,   8,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -101,12 +107,15 @@ using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_instanc
 // irregular tile size
 using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_km_nk_mn_mn_irregular_tile_instances =
     std::tuple<
-        // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
index 7a2a3dbaf3..414ab674d3 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instance.cpp
@@ -23,22 +23,28 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // input: a[m, k], b[k, n], d0[m, n], d1[m, n]
 using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_generic_instance =
     std::tuple<
-        // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
         // clang-format on
         >;
 using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //##############################|      A|      B|        Ds|      E| AData| BData| AccData| CShuffle|    DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|    Layout| Layout|  Type|  Type|    Type| DataType|      Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |          |       |      |      |        |         |          |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |          |       |      |      |        |         |          |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   128,   256,    32,   8,   2,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -101,12 +107,15 @@ using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_instanc
 // irregular tile size
 using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_kn_mn_mn_irregular_tile_instances =
     std::tuple<
-        // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
index fa33609978..cf6a64eb26 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_fastgelu/device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instance.cpp
@@ -23,22 +23,28 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // input: a[m, k], b[n, k], d0[m, n], d1[m ,n]
 using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_generic_instance =
     std::tuple<
-        // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
         // clang-format on
         >;
 using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //##############################|      A|      B|        Ds|      E| AData| BData| AccData| CShuffle|    DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|    Layout| Layout|  Type|  Type|    Type| DataType|      Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |          |       |      |      |        |         |          |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |          |       |      |      |        |         |          |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_Tuple,   F16, PassThrough, PassThrough, AddFastGelu,    GemmDefault,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -92,12 +98,15 @@ using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_instanc
 // irregular tile size
 using device_gemm_add_fastgelu_xdl_c_shuffle_f16_f16_f16_f16_mk_nk_mn_mn_irregular_tile_instances =
     std::tuple<
-        // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col,     Row_Tuple,    Row,   F16,   F16,     F32,      F32,     F16_Tuple,   F16, PassThrough, PassThrough,    AddFastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_instance.cpp
index 28a452c1a1..184d7974d8 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_instance.cpp
@@ -37,11 +37,13 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // input: a[k, m], b[k, n], d0[m, n], d1[m, n], gamma[n], beta[n]
 template <LoopScheduler GemmLoopScheduler, PipelineVersion GemmPipeline>
 using device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#######################################|      A|      B|            Ds|      H| AData| BData| AccData| CShuffle|        DsData| EMeanVarData| GammaData|  BetaData| HData|           A|           B|            CDE|           H|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|          PostShuffle|     PostShuffle|            Layernorm|       Layernorm|     LoopScheduler|     Pipeline|
         //#######################################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|         Type|      Type|      Type|  Type| Elementwise| Elementwise|    Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ThreadClusterLengths| ScalarPerVector| ThreadClusterLengths| ThreadSliceSize|                  |             |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |   Operation|   Operation|      Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                 _M_N|   _NWaveNPerXdl|                 _M_N|              _M|                  |             |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |            |            |               |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                     |                |                     |                |                  |             |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Col,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,     AddReluAdd, PassThrough,    GemmDefault,        1,   256,   256,   128,    32,   2,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,             S<32, 8>,               8,             S<32, 8>,               1, GemmLoopScheduler, GemmPipeline>,
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Col,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,     AddReluAdd, PassThrough,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,             S<32, 8>,               8,             S<32, 8>,               1, GemmLoopScheduler, GemmPipeline>,
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Col,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,     AddReluAdd, PassThrough,    GemmDefault,        1,   256,   128,   256,    32,   2,   2,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,             S<32, 8>,               8,             S<32, 8>,               1, GemmLoopScheduler, GemmPipeline>,
@@ -64,11 +66,13 @@ using device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_instan
 // irregular tile size
 using device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_kn_mn_mn_mn_irregular_tile_instances =
     std::tuple<
-        // clang-format off
+// clang-format off
         //#######################################|      A|      B|            Ds|      H| AData| BData| AccData| CShuffle|        DsData| EMeanVarData| GammaData|  BetaData| HData|           A|           B|            CDE|           H|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|           PostShuffle|     PostShuffle|            Layernorm|       Layernorm|          LoopScheduler|              Pipeline|
         //#######################################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|         Type|      Type|      Type|  Type| Elementwise| Elementwise|    Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|  ThreadClusterLengths| ScalarPerVector| ThreadClusterLengths| ThreadSliceSize|                       |                      |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |   Operation|   Operation|      Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                  _M_N|   _NWaveNPerXdl|                 _M_N|              _M|                       |                      |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |            |            |               |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                      |                |                     |                |                       |                      |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // pipeline v1, 1 wave
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Col,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,     AddReluAdd, PassThrough, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<16, 4>,              1,             S<16, 4>,               1, LoopScheduler::Default,  PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_instance.cpp
index 13366238d6..ef37c82c7f 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_instance.cpp
@@ -37,11 +37,13 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // input: a[k, m], b[k, n], d0[m, n], d1[m, n], gamma[n], beta[n]
 template <LoopScheduler GemmLoopScheduler, PipelineVersion GemmPipeline>
 using device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#######################################|      A|      B|            Ds|      H| AData| BData| AccData| CShuffle|        DsData| EMeanVarData| GammaData|  BetaData| HData|           A|           B|         CDE|           H|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|          PostShuffle|     PostShuffle|            Layernorm|       Layernorm|     LoopScheduler|     Pipeline|
         //#######################################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|         Type|      Type|      Type|  Type| Elementwise| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ThreadClusterLengths| ScalarPerVector| ThreadClusterLengths| ThreadSliceSize|                  |             |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |   Operation|   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                 _M_N|   _NWaveNPerXdl|                 _M_N|              _M|                  |             |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |            |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                     |                |                     |                |                  |             |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Col,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,  AddReluAdd, PassThrough,    GemmDefault,        1,   256,   256,   128,    32,   2,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<32, 8>,               8,             S<32, 8>,               1, GemmLoopScheduler, GemmPipeline>,
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Col,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,  AddReluAdd, PassThrough,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<32, 8>,               8,             S<32, 8>,               1, GemmLoopScheduler, GemmPipeline>,
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Col,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,  AddReluAdd, PassThrough,    GemmDefault,        1,   256,   128,   256,    32,   2,   8,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<32, 8>,               8,             S<32, 8>,               1, GemmLoopScheduler, GemmPipeline>,
@@ -64,11 +66,13 @@ using device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_instan
 // irregular tile size
 using device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_km_nk_mn_mn_mn_irregular_tile_instances =
     std::tuple<
-        // clang-format off
+// clang-format off
         //#######################################|      A|      B|            Ds|      H| AData| BData| AccData| CShuffle|        DsData| EMeanVarData| GammaData|  BetaData| HData|           A|           B|         CDE|           H|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|          PostShuffle|     PostShuffle|            Layernorm|       Layernorm|          LoopScheduler|               Pipeline|
         //#######################################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|         Type|      Type|      Type|  Type| Elementwise| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ThreadClusterLengths| ScalarPerVector| ThreadClusterLengths| ThreadSliceSize|                       |                       |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |   Operation|   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                 _M_N|   _NWaveNPerXdl|                 _M_N|              _M|                       |                       |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |            |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                     |                |                     |                |                       |                       |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // pipeline v1, 1 wave
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Col,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,  AddReluAdd, PassThrough, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,              S<16, 4>,              1,             S<16, 4>,               1, LoopScheduler::Default,   PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_instance.cpp
index 8a4889ee83..40fbc85be0 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_instance.cpp
@@ -37,11 +37,13 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // input: a[k, m], b[k, n], d0[m, n], d1[m, n], gamma[n], beta[n]
 template <LoopScheduler GemmLoopScheduler, PipelineVersion GemmPipeline>
 using device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#######################################|      A|      B|            Ds|      H| AData| BData| AccData| CShuffle|        DsData| EMeanVarData| GammaData|  BetaData| HData|           A|           B|         CDE|           H|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|          PostShuffle|     PostShuffle|            Layernorm|       Layernorm|     LoopScheduler|     Pipeline|
         //#######################################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|         Type|      Type|      Type|  Type| Elementwise| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ThreadClusterLengths| ScalarPerVector| ThreadClusterLengths| ThreadSliceSize|                  |             |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |   Operation|   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                 _M_N|   _NWaveNPerXdl|                 _M_N|              _M|                  |             |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |            |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                     |                |                     |                |                  |             |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Row,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,  AddReluAdd, PassThrough,    GemmDefault,        1,   256,   256,   128,    32,   8,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,             S<32, 8>,               8,             S<32, 8>,               1, GemmLoopScheduler, GemmPipeline>,
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Row,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,  AddReluAdd, PassThrough,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,             S<32, 8>,               8,             S<32, 8>,               1, GemmLoopScheduler, GemmPipeline>,
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Row,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,  AddReluAdd, PassThrough,    GemmDefault,        1,   256,   128,   256,    32,   8,   2,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,             S<32, 8>,               8,             S<32, 8>,               1, GemmLoopScheduler, GemmPipeline>,
@@ -64,11 +66,13 @@ using device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_instan
 // irregular tile size
 using device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_kn_mn_mn_mn_irregular_tile_instances =
     std::tuple<
-        // clang-format off
+// clang-format off
         //#######################################|      A|      B|            Ds|      H| AData| BData| AccData| CShuffle|        DsData| EMeanVarData| GammaData|  BetaData| HData|           A|           B|         CDE|           H|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|          PostShuffle|     PostShuffle|            Layernorm|       Layernorm|          LoopScheduler|              Pipeline|
         //#######################################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|         Type|      Type|      Type|  Type| Elementwise| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ThreadClusterLengths| ScalarPerVector| ThreadClusterLengths| ThreadSliceSize|                       |                      |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |   Operation|   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                 _M_N|   _NWaveNPerXdl|                 _M_N|              _M|                       |                      |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |            |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                     |                |                     |                |                       |                      |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // pipeline v1, 1 wave
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Row,    Row, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,  AddReluAdd, PassThrough, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<16, 4>,              1,             S<16, 4>,               1, LoopScheduler::Default,  PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES
diff --git a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_instance.cpp
index fc3cbcf905..464279c376 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm/device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_instance.cpp
@@ -37,11 +37,13 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // input: a[k, m], b[k, n], d0[m, n], d1[m, n], gamma[n], beta[n]
 template <LoopScheduler GemmLoopScheduler, PipelineVersion GemmPipeline>
 using device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#######################################|      A|      B|            Ds|      H| AData| BData| AccData| CShuffle|        DsData| EMeanVarData| GammaData|  BetaData| HData|           A|           B|         CDE|           H|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|          PostShuffle|     PostShuffle|            Layernorm|       Layernorm|     LoopScheduler|     Pipeline|
         //#######################################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|         Type|      Type|      Type|  Type| Elementwise| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ThreadClusterLengths| ScalarPerVector| ThreadClusterLengths| ThreadSliceSize|                  |             |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |   Operation|   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                 _M_N|   _NWaveNPerXdl|                 _M_N|              _M|                  |             |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |            |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                     |                |                     |                |                  |             |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Row,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,  AddReluAdd, PassThrough,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<32, 8>,               8,             S<32, 8>,               1, GemmLoopScheduler, GemmPipeline>,
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Row,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,  AddReluAdd, PassThrough,    GemmDefault,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<32, 8>,               8,             S<32, 8>,               1, GemmLoopScheduler, GemmPipeline>,
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Row,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,  AddReluAdd, PassThrough,    GemmDefault,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,             S<16, 8>,               8,             S<16, 8>,               1, GemmLoopScheduler, GemmPipeline>,
@@ -61,11 +63,13 @@ using device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_instan
 // irregular tile size
 using device_gemm_add_relu_add_xdl_c_shuffle_layernorm_f16_mk_nk_mn_mn_mn_irregular_tile_instances =
     std::tuple<
-        // clang-format off
+// clang-format off
         //#######################################|      A|      B|            Ds|      H| AData| BData| AccData| CShuffle|        DsData| EMeanVarData| GammaData|  BetaData| HData|           A|           B|         CDE|           H|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|          PostShuffle|     PostShuffle|            Layernorm|       Layernorm|          LoopScheduler|              Pipeline|
         //#######################################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|         Type|      Type|      Type|  Type| Elementwise| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| ThreadClusterLengths| ScalarPerVector| ThreadClusterLengths| ThreadSliceSize|                       |                      |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |   Operation|   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|                 _M_N|   _NWaveNPerXdl|                 _M_N|              _M|                       |                      |
         //#######################################|       |       |              |       |      |      |        |         |              |             |          |          |      |            |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                     |                |                     |                |                       |                      |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // pipeline v1, 1 wave
         DeviceGemmMultipleDLayernorm_Xdl_CShuffle<    Row,    Col, Row_Row_Tuple,    Row,   F16,   F16,     F32,      F32, F16_F16_Tuple,          F16,       F16,       F16,   F16, PassThrough, PassThrough,  AddReluAdd, PassThrough, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<16, 4>,              1,             S<16, 4>,               1, LoopScheduler::Default,  PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES
diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
index 803c44c7f5..e2bf62ca94 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instance.cpp
@@ -22,22 +22,28 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // outout: e[m, n]
 // input: a[k, m], b[k, n]
 using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_generic_instance = std::tuple<
-    // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
     // clang-format on
     >;
 using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu,    GemmDefault,        1,   256,   256,   128,    32,   2,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu,    GemmDefault,        1,   256,   128,   256,    32,   2,   2,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -99,12 +105,15 @@ using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances = std::t
 
 // irregular tile size
 using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_irregular_tile_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
index 9b9ef3db24..bbf2836cf8 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instance.cpp
@@ -22,22 +22,28 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // outout: e[m, n]
 // input: a[k, m], b[k, n]
 using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_generic_instance = std::tuple<
-    // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
     // clang-format on
     >;
 using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu,    GemmDefault,        1,   256,   256,   128,    32,   2,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu,    GemmDefault,        1,   256,   128,   256,    32,   2,   8,   32,   32,    2,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              2,         0,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -99,12 +105,15 @@ using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances = std::t
 
 // irregular tile size
 using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_irregular_tile_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Col,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
index 1a0b6c9d1c..f22739e1a6 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -22,22 +22,28 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // outout: e[m, n]
 // input: a[m, k], b[k, n]
 using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_generic_instance = std::tuple<
-    // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
     // clang-format on
     >;
 using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //##############################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|         CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F32, Empty_Tuple,   F16, PassThrough, PassThrough,    FastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F32, Empty_Tuple,   F16, PassThrough, PassThrough,    FastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   2,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F32, Empty_Tuple,   F16, PassThrough, PassThrough,    FastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,   F16,     F32,      F32, Empty_Tuple,   F16, PassThrough, PassThrough,    FastGelu,    GemmDefault,        1,   256,   128,   256,    32,   8,   2,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,         0,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -99,12 +105,15 @@ using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances = std::t
 
 // irregular tile size
 using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_irregular_tile_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Row,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
diff --git a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
index 18b1c0e993..b049a5bac1 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_fastgelu/device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instance.cpp
@@ -22,22 +22,28 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // outout: e[m, n]
 // input: a[m, k], b[n, k]
 using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_generic_instance = std::tuple<
-    // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
     // clang-format on
     >;
 using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //##############################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|         CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F32, Empty_Tuple,   F16, PassThrough, PassThrough,    FastGelu,    GemmDefault,        1,   256,   128,   128,    64,  16,  16,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F32, Empty_Tuple,   F16, PassThrough, PassThrough,    FastGelu,    GemmDefault,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F32, Empty_Tuple,   F16, PassThrough, PassThrough,    FastGelu,    GemmDefault,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 32, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col, Empty_Tuple,    Row,   F16,   F16,     F32,      F32, Empty_Tuple,   F16, PassThrough, PassThrough,    FastGelu,    GemmDefault,        1,   128,   128,   128,    32,   8,   8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,         1,           1,           1,               S<1, 16, 1, 8>,               8, LoopScheduler::Default,        PipelineVersion::v1>,
@@ -90,12 +96,15 @@ using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances = std::t
 
 // irregular tile size
 using device_gemm_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_irregular_tile_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //##############################|      A|      B|            Ds|      E| AData| BData| AccData| CShuffle|        DsData| EData|           A|           B|            CDE|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|          LoopScheduler|                    Pipeline|
         //##############################| Layout| Layout|        Layout| Layout|  Type|  Type|    Type| DataType|          Type|  Type| Elementwise| Elementwise|    Elementwise| Specialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |   Operation|   Operation|      Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|                       |                            |
         //##############################|       |       |              |       |      |      |        |         |              |      |            |            |               |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |                       |                            |
         // pipeline v1, 1 wave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    64,  16,  16,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmMultipleD_Xdl_CShuffle<    Row,    Col,   Empty_Tuple,    Row,   F16,   F16,     F32,      F32,   Empty_Tuple,   F16, PassThrough, PassThrough,       FastGelu, GemmMNKPadding,        1,    64,    16,    16,    32,   8,   8,   16,   16,    1,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              1,              8,         1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              1,              8,         1,           1,           1,               S<1, 16, 1, 4>,               1, LoopScheduler::Default,        PipelineVersion::v1>
 #if CK_EXPERIMENTAL_INTER_WAVE_INSTANCES        
         // pipeline v1, 2 waves
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp
index f94d143418..17da9ce5b7 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply/device_gemm_multiply_multiply_xdl_f8_f8_bf16/device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn.hpp
@@ -50,6 +50,7 @@ using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_instances_part1
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,   160,   128,  16,  16,  32,   32,    2,    5,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,   128,   128,  16,  16,  32,   32,    4,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,    96,   128,  16,  16,  32,   32,    2,    3,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,    96,   128,  16,  16,  16,   16,    8,    3,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,    64,   128,  16,  16,  32,   32,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,   256,   128,  16,  16,  32,   32,    2,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,   224,   128,  16,  16,  32,   32,    1,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 64, 1, 4>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
@@ -73,6 +74,13 @@ using device_gemm_multiply_multiply_xdl_f8_f8_bf16_mk_nk_mn_comp_instances_part2
 #if defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
 
         // Compute friendly
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   192,    128,  128,  16,  16,  32,   32,    3,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   160,   128,   128,  16,  16,  16,   16,    5,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   192,    96,   128,  16,  16,  16,   16,    6,    3,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    96,   128,   128,  16,  16,  16,   16,    3,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    96,   128,   256,  16,  16,  16,   16,    3,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    80,   128,   256,  16,  16,  16,   16,    5,    2,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,   256,   128,  16,  16,  32,   32,    1,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,   224,   128,  16,  16,  16,   16,    2,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemmMultiD_Xdl_CShuffle_V3<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,   192,   256,  16,  16,  32,   32,    1,    3,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/CMakeLists.txt
index 744cbc9bcf..943b2bf4c7 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/CMakeLists.txt
@@ -5,30 +5,38 @@ list(APPEND GEMM_MULTIPLY_MULTIPLY_WEIGHT_PRESHUFFLE_INSTANCES
         device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance.cpp
         device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance.cpp
         device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance.cpp
-        # device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_padding_instance.cpp
-        # device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_padding_instance.cpp
-        # device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_padding_instance.cpp
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance_v2.cpp
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance_v2.cpp
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance_v2.cpp
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instance.cpp
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance.cpp
 
         device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance.cpp
         device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance.cpp
         device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance.cpp
-        # device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_padding_instance.cpp
-        # device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_padding_instance.cpp
-        # device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_padding_instance.cpp
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance_v2.cpp
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance_v2.cpp
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance_v2.cpp
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instance.cpp
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance.cpp
         )
 
 set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
-# set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_padding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
-# set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_padding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
-# set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_padding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 
 set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
-# set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_padding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
-# set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_padding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
-# set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_padding_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance_v2.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
+set_source_files_properties(device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance.cpp PROPERTIES COMPILE_OPTIONS ";-mllvm;-greedy-reverse-local-assignment=1")
 
 add_instance_library(device_gemm_multiply_multiply_weight_preshuffle_instance ${GEMM_MULTIPLY_MULTIPLY_WEIGHT_PRESHUFFLE_INSTANCES})
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance.cpp
new file mode 100644
index 0000000000..71383f5dc1
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instance.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     BF16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_instances<
+            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn.hpp
index dd7cbab59c..a138452295 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn.hpp
@@ -34,7 +34,10 @@ static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
 static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
 static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
-template <GemmSpecialization GemmSpec>
+static constexpr auto v1 = BlockGemmPipelineVersion::v1;
+static constexpr auto v2 = BlockGemmPipelineVersion::v2;
+
+template <BlockGemmPipelineVersion BlkGemmPipeVer, GemmSpecialization GemmSpec>
 using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_instances =
     std::tuple<
 // clang-format off
@@ -43,23 +46,23 @@ using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_
         //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
 #if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,    128,   128,  16,  16,  32,   32,    8,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    128,   128,  16,  16,  32,   32,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    128,   128,  16,  16,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    128,   128,  16,  16,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,    128,   128,  16,  16,  32,   32,    8,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    128,   128,  16,  16,  32,   32,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    128,   128,  16,  16,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    128,   128,  16,  16,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
         // N 256
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,    256,   128,  16,  16,  32,   32,    8,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    256,   128,  16,  16,  32,   32,    4,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    256,   128,  16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    256,   128,  16,  16,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,    256,   128,  16,  16,  32,   32,    8,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    256,   128,  16,  16,  32,   32,    4,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    256,   128,  16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    256,   128,  16,  16,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
         // N 512
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    512,   128,  16,  16,  32,   32,    2,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    512,   128,  16,  16,  32,   32,    1,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    512,   128,  16,  16,  32,   32,    2,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    512,   128,  16,  16,  32,   32,    1,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>
 #endif
         // clang-format on
         >;
 
-template <GemmSpecialization GemmSpec>
+template <BlockGemmPipelineVersion BlkGemmPipeVer, GemmSpecialization GemmSpec>
 using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_instances =
     std::tuple<
 // clang-format off
@@ -68,25 +71,25 @@ using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_
         //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
 #if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    128,   256,  16,  16,  32,   32,    4,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    128,   256,  16,  16,  32,   32,    2,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    128,   256,  16,  16,  32,   32,    1,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    128,   512,  16,  16,  32,   32,    2,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    128,   512,  16,  16,  32,   32,    1,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    128,   256,  16,  16,  32,   32,    4,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    128,   256,  16,  16,  32,   32,    2,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    128,   256,  16,  16,  32,   32,    1,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    128,   512,  16,  16,  32,   32,    2,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    128,   512,  16,  16,  32,   32,    1,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
         // N 256
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    256,   256,  16,  16,  32,   32,    4,    2,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    256,   256,  16,  16,  32,   32,    2,    2,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    256,   256,  16,  16,  32,   32,    1,    2,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    256,   512,  16,  16,  32,   32,    2,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    256,   512,  16,  16,  32,   32,    1,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    256,   256,  16,  16,  32,   32,    4,    2,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    256,   256,  16,  16,  32,   32,    2,    2,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    256,   256,  16,  16,  32,   32,    1,    2,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    256,   512,  16,  16,  32,   32,    2,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    256,   512,  16,  16,  32,   32,    1,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
         // N 512
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    512,   256,  16,  16,  32,   32,    2,    4,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    512,   256,  16,  16,  32,   32,    1,    4,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    512,   256,  16,  16,  32,   32,    2,    4,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    512,   256,  16,  16,  32,   32,    1,    4,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>
 #endif
         // clang-format on
         >;
 
-template <GemmSpecialization GemmSpec>
+template <BlockGemmPipelineVersion BlkGemmPipeVer, GemmSpecialization GemmSpec>
 using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_instances =
     std::tuple<
 // clang-format off
@@ -95,13 +98,93 @@ using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_
         //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
 #if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    64,   256,  16,  16,  16,   16,    1,    1,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    128,   256,  16,  16,  16,   16,    1,    2,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    256,   256,  16,  16,  16,   16,    1,    4,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    512,   256,  16,  16,  16,   16,    1,    8,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    64,   512,  16,  16,  16,   16,    1,    1,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    128,   512,  16,  16,  16,   16,    1,    2,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    256,   512,  16,  16,  16,   16,    1,    4,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    64,   256,  16,  16,  16,   16,    1,    1,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    128,   256,  16,  16,  16,   16,    1,    2,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    256,   256,  16,  16,  16,   16,    1,    4,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    512,   256,  16,  16,  16,   16,    1,    8,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    64,   512,  16,  16,  16,   16,    1,    1,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    128,   512,  16,  16,  16,   16,    1,    2,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    256,   512,  16,  16,  16,   16,    1,    4,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>
+#endif
+        // clang-format on
+        >;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_instances =
+    std::tuple<
+// clang-format off
+        //############################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //############################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //############################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //############################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,    256,   128,  16,  16,  32,   32,    4,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,    256,   128,  16,  16,  32,   32,    7,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,    256,   128,  16,  16,  32,   32,    6,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,    256,   128,  16,  16,  32,   32,    5,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    256,   128,  16,  16,  32,   32,    4,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,    128,   128,  16,  16,  32,   32,    4,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,    128,   128,  16,  16,  32,   32,    7,    1,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,    128,   128,  16,  16,  32,   32,    6,    1,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,    128,   128,  16,  16,  32,   32,    5,    1,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    128,   128,  16,  16,  32,   32,    4,    1,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
+#endif
+        // clang-format on
+        >;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_instances =
+    std::tuple<
+// clang-format off
+        //############################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //############################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //############################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //############################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+        // Compute friendly
+        // 256x[64, 256, 32]x128
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,    256,   128,  16,  16,  16,   16,    8,    8,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,    224,   128,  16,  16,  16,   16,    8,    7,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,    192,   128,  16,  16,  16,   16,    8,    6,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,    160,   128,  16,  16,  16,   16,    8,    5,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,    128,   128,  16,  16,  16,   16,    8,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,     96,   128,  16,  16,  16,   16,    8,    3,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,     64,   128,  16,  16,  16,   16,    8,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        // 224x[64, 256, 32]x128
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,    256,   128,  16,  16,  16,   16,    7,    8,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,    224,   128,  16,  16,  16,   16,    7,    7,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,    192,   128,  16,  16,  16,   16,    7,    6,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,    160,   128,  16,  16,  16,   16,    7,    5,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,    128,   128,  16,  16,  16,   16,    7,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,     96,   128,  16,  16,  16,   16,    7,    3,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,     64,   128,  16,  16,  16,   16,    7,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        // 192x[64, 256, 32]x128, 192x[64]x256
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,    256,   128,  16,  16,  16,   16,    6,    8,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,    224,   128,  16,  16,  16,   16,    6,    7,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,    192,   128,  16,  16,  16,   16,    6,    6,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,    160,   128,  16,  16,  16,   16,    6,    5,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,    128,   128,  16,  16,  16,   16,    6,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,     96,   128,  16,  16,  16,   16,    6,    3,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,     64,   128,  16,  16,  16,   16,    6,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        // 160x[64, 256, 32]x128, 160x[64, 96, 32]x256
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,    256,   128,  16,  16,  16,   16,    5,    8,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,    224,   128,  16,  16,  16,   16,    5,    7,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,    192,   128,  16,  16,  16,   16,    5,    6,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,    160,   128,  16,  16,  16,   16,    5,    5,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,    128,   128,  16,  16,  16,   16,    5,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,     96,   128,  16,  16,  16,   16,    5,    3,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,     64,   128,  16,  16,  16,   16,    5,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        // 128x[64, 256, 32]x128, 128x[64, 128, 32]x256
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    256,   128,  16,  16,  16,   16,    4,    8,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    224,   128,  16,  16,  16,   16,    4,    7,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    192,   128,  16,  16,  16,   16,    4,    6,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    160,   128,  16,  16,  16,   16,    4,    5,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    128,   128,  16,  16,  16,   16,    4,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,     96,   128,  16,  16,  16,   16,    4,    3,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,     64,   128,  16,  16,  16,   16,    4,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    128,   256,  16,  16,  16,   16,    4,    4,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,     96,   256,  16,  16,  16,   16,    4,    3,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, BF16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,     64,   256,  16,  16,  16,   16,    4,    2,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
 #endif
         // clang-format on
         >;
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instance.cpp
index 8f1b60815a..e3ff079d99 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn.hpp"
 
@@ -8,7 +8,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_compute_default_instances(
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma16x16_mn_compute_default_instances(
     std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
                                                                      Col,
                                                                      Tuple<Row, Col>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance.cpp
index 90f6e06947..6e9b3ea172 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_default_instance.cpp
@@ -24,6 +24,7 @@ void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_
     add_device_operation_instances(
         instances,
         device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p1_instances<
+            v1,
             GemmDefault>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance.cpp
index 4128c81f68..8557d0c80e 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_default_instance.cpp
@@ -24,6 +24,7 @@ void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_
     add_device_operation_instances(
         instances,
         device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p2_instances<
+            v1,
             GemmDefault>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance.cpp
index bc6846a3c8..84c2c70e35 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_default_instance.cpp
@@ -24,6 +24,7 @@ void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_
     add_device_operation_instances(
         instances,
         device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_bf16_mk_mfma_mn_p3_instances<
+            v1,
             GemmDefault>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance.cpp
new file mode 100644
index 0000000000..05529f9cdd
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instance.cpp
@@ -0,0 +1,33 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_default_instances(
+    std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
+                                                                     Col,
+                                                                     Tuple<Row, Col>,
+                                                                     Row,
+                                                                     F8,
+                                                                     F8,
+                                                                     Tuple<F32, F32>,
+                                                                     F16,
+                                                                     PassThrough,
+                                                                     PassThrough,
+                                                                     MultiplyMultiply>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_instances<
+            GemmDefault>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn.hpp
index e09554c545..c4f53e834a 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn.hpp
@@ -34,7 +34,10 @@ static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
 static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
 static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
-template <GemmSpecialization GemmSpec>
+static constexpr auto v1 = BlockGemmPipelineVersion::v1;
+static constexpr auto v2 = BlockGemmPipelineVersion::v2;
+
+template <BlockGemmPipelineVersion BlkGemmPipeVer, GemmSpecialization GemmSpec>
 using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_instances =
     std::tuple<
 // clang-format off
@@ -43,23 +46,23 @@ using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p
         //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
 #if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,    128,   128,  16,  16,  32,   32,    8,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    128,   128,  16,  16,  32,   32,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    128,   128,  16,  16,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    128,   128,  16,  16,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,    128,   128,  16,  16,  32,   32,    8,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    128,   128,  16,  16,  32,   32,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    128,   128,  16,  16,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    128,   128,  16,  16,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
         // N 256
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,    256,   128,  16,  16,  32,   32,    8,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    256,   128,  16,  16,  32,   32,    4,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    256,   128,  16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    256,   128,  16,  16,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   256,    256,   128,  16,  16,  32,   32,    8,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    256,   128,  16,  16,  32,   32,    4,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    256,   128,  16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    256,   128,  16,  16,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
         // N 512
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    512,   128,  16,  16,  32,   32,    2,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    512,   128,  16,  16,  32,   32,    1,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    512,   128,  16,  16,  32,   32,    2,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    512,   128,  16,  16,  32,   32,    1,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>
 #endif
         // clang-format on
         >;
 
-template <GemmSpecialization GemmSpec>
+template <BlockGemmPipelineVersion BlkGemmPipeVer, GemmSpecialization GemmSpec>
 using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_instances =
     std::tuple<
 // clang-format off
@@ -68,25 +71,25 @@ using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p
         //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
 #if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    128,   256,  16,  16,  32,   32,    4,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    128,   256,  16,  16,  32,   32,    2,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    128,   256,  16,  16,  32,   32,    1,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    128,   512,  16,  16,  32,   32,    2,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    128,   512,  16,  16,  32,   32,    1,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    128,   256,  16,  16,  32,   32,    4,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    128,   256,  16,  16,  32,   32,    2,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    128,   256,  16,  16,  32,   32,    1,    1,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    128,   512,  16,  16,  32,   32,    2,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    128,   512,  16,  16,  32,   32,    1,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
         // N 256
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    256,   256,  16,  16,  32,   32,    4,    2,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    256,   256,  16,  16,  32,   32,    2,    2,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    256,   256,  16,  16,  32,   32,    1,    2,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    256,   512,  16,  16,  32,   32,    2,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    256,   512,  16,  16,  32,   32,    1,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,   128,    256,   256,  16,  16,  32,   32,    4,    2,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    256,   256,  16,  16,  32,   32,    2,    2,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    256,   256,  16,  16,  32,   32,    1,    2,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    256,   512,  16,  16,  32,   32,    2,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    256,   512,  16,  16,  32,   32,    1,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
         // N 512
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    512,   256,  16,  16,  32,   32,    2,    4,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    512,   256,  16,  16,  32,   32,    1,    4,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    64,    512,   256,  16,  16,  32,   32,    2,    4,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,    32,    512,   256,  16,  16,  32,   32,    1,    4,     S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,     S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>
 #endif
         // clang-format on
         >;
 
-template <GemmSpecialization GemmSpec>
+template <BlockGemmPipelineVersion BlkGemmPipeVer, GemmSpecialization GemmSpec>
 using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_instances =
     std::tuple<
 // clang-format off
@@ -95,13 +98,93 @@ using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p
         //################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
 #if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    64,   256,  16,  16,  16,   16,    1,    1,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    128,   256,  16,  16,  16,   16,    1,    2,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    256,   256,  16,  16,  16,   16,    1,    4,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    512,   256,  16,  16,  16,   16,    1,    8,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    64,   512,  16,  16,  16,   16,    1,    1,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    128,   512,  16,  16,  16,   16,    1,    2,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>,
-        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    256,   512,  16,  16,  16,   16,    1,    4,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1, F8>
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    64,   256,  16,  16,  16,   16,    1,    1,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    128,   256,  16,  16,  16,   16,    1,    2,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    256,   256,  16,  16,  16,   16,    1,    4,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    512,   256,  16,  16,  16,   16,    1,    8,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    64,   512,  16,  16,  16,   16,    1,    1,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 16>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    128,   512,  16,  16,  16,   16,    1,    2,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     16,    256,   512,  16,  16,  16,   16,    1,    4,     S<32, 8, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 16, 1, 16>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlkGemmPipeVer, F8>
+#endif
+        // clang-format on
+        >;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_compute_instances =
+    std::tuple<
+// clang-format off
+        //############################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //############################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //############################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //############################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,    256,   128,  16,  16,  32,   32,    4,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,    256,   128,  16,  16,  32,   32,    7,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,    256,   128,  16,  16,  32,   32,    6,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,    256,   128,  16,  16,  32,   32,    5,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    256,   128,  16,  16,  32,   32,    4,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,    128,   128,  16,  16,  32,   32,    4,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,    128,   128,  16,  16,  32,   32,    7,    1,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,    128,   128,  16,  16,  32,   32,    6,    1,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,    128,   128,  16,  16,  32,   32,    5,    1,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    128,   128,  16,  16,  32,   32,    4,    1,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
+#endif
+        // clang-format on
+        >;
+
+template <GemmSpecialization GemmSpec>
+using device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma16x16_mn_compute_instances =
+    std::tuple<
+// clang-format off
+        //############################################| ALayout| BLayout|         DsLayout| ELayout|AData| BData|          DsData| EData| AccData| Cshuffle|           A|           B|              C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
+        //############################################|        |        |                 |        | Type|  Type|            Type|  Type|    Type|     Type| Elementwise| Elementwise|    Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
+        //############################################|        |        |                 |        |     |      |                |      |        |         |   Operation|   Operation|      Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
+        //############################################|        |        |                 |        |     |      |                |      |        |         |            |            |               |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
+#if defined(__gfx94__) || defined(CK_USE_GFX94) || defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH)
+        // Compute friendly
+        // 256x[64, 256, 32]x128
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,    256,   128,  16,  16,  16,   16,    8,    8,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,    224,   128,  16,  16,  16,   16,    8,    7,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,    192,   128,  16,  16,  16,   16,    8,    6,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,    160,   128,  16,  16,  16,   16,    8,    5,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,    128,   128,  16,  16,  16,   16,    8,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,     96,   128,  16,  16,  16,   16,    8,    3,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     256,     64,   128,  16,  16,  16,   16,    8,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        // 224x[64, 256, 32]x128
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,    256,   128,  16,  16,  16,   16,    7,    8,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,    224,   128,  16,  16,  16,   16,    7,    7,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,    192,   128,  16,  16,  16,   16,    7,    6,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,    160,   128,  16,  16,  16,   16,    7,    5,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,    128,   128,  16,  16,  16,   16,    7,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,     96,   128,  16,  16,  16,   16,    7,    3,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     224,     64,   128,  16,  16,  16,   16,    7,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        // 192x[64, 256, 32]x128, 192x[64]x256
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,    256,   128,  16,  16,  16,   16,    6,    8,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,    224,   128,  16,  16,  16,   16,    6,    7,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,    192,   128,  16,  16,  16,   16,    6,    6,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,    160,   128,  16,  16,  16,   16,    6,    5,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,    128,   128,  16,  16,  16,   16,    6,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,     96,   128,  16,  16,  16,   16,    6,    3,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     192,     64,   128,  16,  16,  16,   16,    6,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        // 160x[64, 256, 32]x128, 160x[64, 96, 32]x256
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,    256,   128,  16,  16,  16,   16,    5,    8,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,    224,   128,  16,  16,  16,   16,    5,    7,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,    192,   128,  16,  16,  16,   16,    5,    6,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,    160,   128,  16,  16,  16,   16,    5,    5,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,    128,   128,  16,  16,  16,   16,    5,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,     96,   128,  16,  16,  16,   16,    5,    3,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,    S<4, 4, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     160,     64,   128,  16,  16,  16,   16,    5,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        // 128x[64, 256, 32]x128, 128x[64, 128, 32]x256
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    256,   128,  16,  16,  16,   16,    4,    8,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    224,   128,  16,  16,  16,   16,    4,    7,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    192,   128,  16,  16,  16,   16,    4,    6,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    160,   128,  16,  16,  16,   16,    4,    5,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    128,   128,  16,  16,  16,   16,    4,    4,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,     96,   128,  16,  16,  16,   16,    4,    3,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,     64,   128,  16,  16,  16,   16,    4,    2,     S<8, 32, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,    128,   256,  16,  16,  16,   16,    4,    4,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,     96,   256,  16,  16,  16,   16,    4,    3,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+        DeviceGemmMultiD_Xdl_CShuffle_V3_BPreshuffle<  Row,     Col,     Tuple<Row, Col>,  Row,    F8,    F8,    Tuple<F32, F32>, F16,  F32,     F32,     PassThrough, PassThrough, MultiplyMultiply,    GemmSpec,   256,     128,     64,   256,  16,  16,  16,   16,    4,    2,     S<16, 16, 1>,   S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,    S<8, 8, 1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>
 #endif
         // clang-format on
         >;
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance.cpp
index 025c65dc7f..cb15688e6f 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instance.cpp
@@ -8,7 +8,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
-void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instances(
+void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_default_instances_v2(
     std::vector<std::unique_ptr<DeviceGemmMultipleDSplitKBPreShuffle<Row,
                                                                      Col,
                                                                      Tuple<Row, Col>,
@@ -24,6 +24,7 @@ void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_m
     add_device_operation_instances(
         instances,
         device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p1_instances<
+            v2,
             GemmDefault>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance.cpp
index 250f8279d5..c9ab9c1071 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_default_instance.cpp
@@ -24,6 +24,7 @@ void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_m
     add_device_operation_instances(
         instances,
         device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p2_instances<
+            v1,
             GemmDefault>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance.cpp
index f7a8a925ce..fb43347ceb 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_multiply_multiply_weight_preshuffle/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16/device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_default_instance.cpp
@@ -24,6 +24,7 @@ void add_device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_m
     add_device_operation_instances(
         instances,
         device_gemm_multiply_multiply_weight_preshuffle_xdl_f8_f8_f16_mk_mfma_mn_p3_instances<
+            v1,
             GemmDefault>{});
 }
 
diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v1_interwave_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v1_interwave_instance.cpp
index efc7a7ebfd..385f474ecc 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v1_interwave_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v1_interwave_instance.cpp
@@ -33,11 +33,13 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // Compilation parameters for a[m, k] * b[k, n] = c[m, n]
 template <ck::tensor_operation::device::GemmSpecialization GemmSpec>
 using device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v1_iw_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################|AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C|          GEMM| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //#########################| Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //#########################|     |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //#########################|     |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         //PipelineVersion::v1; interwave
         DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8,  F16, PipelineVersion::v1, LoopScheduler::Interwave>,
         DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8,  F16, PipelineVersion::v1, LoopScheduler::Interwave>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v1_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v1_irregular_instance.cpp
index 6a323d323f..0b48bbf606 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v1_irregular_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v1_irregular_instance.cpp
@@ -34,11 +34,13 @@ template <ck::tensor_operation::device::GemmSpecialization GemmSpec,
           ck::PipelineVersion PipVer,
           ck::LoopScheduler LoopSche>
 using device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_irregular_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################|AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C|          GEMM| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //#########################| Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //#########################|     |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //#########################|     |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,      GemmSpec,   128,    16,   128,     4,  8,   16,   16,    1,    4,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               4,  F16, PipVer, LoopSche>,
         DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,      GemmSpec,   128,    16,   256,     4,  8,   16,   16,    1,    8,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              8,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               4,  F16, PipVer, LoopSche>,
         DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,      Row,    Row, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,    16,   256,     4,  8,   16,   16,    1,    4,  S<1, 4, 16, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,      true,           1,           1,                   S<1, 16, 1, 16>,              4,  F16, PipVer, LoopSche>,        
diff --git a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_v1_interwave_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_v1_interwave_instance.cpp
index 2855235f97..422db05b35 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_v1_interwave_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_splitk/device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_v1_interwave_instance.cpp
@@ -33,12 +33,14 @@ static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecial
 // Compilation parameters for a[m, k] * b[k, n] = c[m, n]
 template <ck::tensor_operation::device::GemmSpecialization GemmSpec>
 using device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_v1_iw_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################|AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C|          GEMM| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
         //#########################| Type|  Type|  Type|    Type|        |        |        | Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|
         //#########################|     |      |      |        |        |        |        |   Operation|   Operation|   Operation|              |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|
         //#########################|     |      |      |        |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |
         //PipelineVersion::v1; interwave
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   256,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             3,              8,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8,  F16, PipelineVersion::v1, LoopScheduler::Interwave>,
         DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   256,     4,  8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             3,              8,              8,      true,           1,           1,                   S<1, 32, 1, 8>,               8,  F16, PipelineVersion::v1, LoopScheduler::Interwave>,
         DeviceGemmXdlSplitKCShuffle<  F16,   F16,   F16,     F32,     Row,     Col,     Row, PassThrough, PassThrough, PassThrough,      GemmSpec,   128,   128,   128,     4,  8,   32,   32,    4,    2,  S<1, 4, 32, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,      true,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             3,              8,              8,      true,           1,           1,                   S<1, 16, 1, 8>,               8,  F16, PipelineVersion::v1, LoopScheduler::Interwave>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_universal/CMakeLists.txt
index 188c9f68e3..ade65eacf3 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/CMakeLists.txt
@@ -97,6 +97,9 @@ list(APPEND GEMM_UNIVERSAL_INSTANCES
           device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v2_default_instance.cpp
           device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp
 
+          device_gemm_xdl_universal_f16_i4_f16/device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_v2_default_instance.cpp
+          device_gemm_xdl_universal_bf16_i4_bf16/device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_v2_default_instance.cpp
+
           device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_default_instance.cpp
           device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_kpadding_instance.cpp
           device_gemm_xdl_universal_f8_f16_f16/device_gemm_xdl_universal_f8_f16_f16_mk_kn_mn_comp_mnpadding_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn.hpp
index 3300c4b0f7..5540d2d884 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn.hpp
@@ -36,12 +36,13 @@ static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
 template <GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_comp_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-        
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              8,              4,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   4,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,               1,              8,              4,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   2,   2,  32,   32,    2,    2,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              4,              2,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
@@ -59,12 +60,13 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_comp_instances = std::tu
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_bf16_bf16_bf16_km_kn_mn_mem_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Latency friendly
         DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              4,              4,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   2,   2,  16,   16,    1,    1,     S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,               1,              2,              2,          0,    S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp
index d7b0051186..d9f9969621 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn.hpp
@@ -36,14 +36,17 @@ static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
 template <GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-        
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Compute friendly
+#if !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   8,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   8,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
@@ -53,7 +56,9 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_instances = std::tu
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   8,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+#if !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    64,   8,   8,  16,   16,    8,    7,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 32, 1, 8>,                4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   8,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   8,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   8,  32,   32,    2,    2,     S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,              1,              8,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
@@ -64,12 +69,13 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_comp_instances = std::tu
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Latency friendly 
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   8,  16,   16,    1,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
@@ -84,7 +90,9 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_km_nk_mn_mem_instances = std::tup
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   8,  16,   16,    1,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   4,   8,  16,   16,    1,    1,     S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              4,              4,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   4,   8,  16,   16,    1,    1,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+#if !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   4,   8,  16,   16,    1,    2,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   4,   4,  16,   16,    1,    2,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   4,   8,  16,   16,    1,    4,     S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              4,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3<  Col,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   2,   8,  16,   16,    1,    4,     S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,              1,              2,              2,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp
index 99082367d7..0619a98cf0 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp
@@ -36,19 +36,24 @@ static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
 template <GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-        
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
+#if !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+#if !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           2,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
@@ -57,21 +62,32 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_comp_instances = std::tu
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn_mem_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Latency friendly
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   4,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,   128,   8,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,   64,   128,   8,   4,  16,   16,    1,    1,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
 
         // Memory friendly
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   256,   8,   8,  16,   16,    1,    1,     S<32, 2, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,   256,   8,   8,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,   64,   256,   8,   8,  16,   16,    1,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   8,   2,  16,   16,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   2,   2,  16,   16,    4,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,    64,   8,   4,  16,   16,    4,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   16,    64,   8,   4,  16,   16,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   8,   4,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   8,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+#if !defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   8,   4,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   8,   4,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+#endif // !defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   8,   4,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
     // clang-format on
     >;
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp
index 19ee15c6eb..3edbd28cd8 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp
@@ -36,17 +36,21 @@ static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
 template <GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-        
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Compute friendly
+#if !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+#if !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         // AGPR Spill
         // DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
@@ -54,6 +58,7 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_instances = std::tu
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    64,   8,   8,  16,   16,    8,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 32, 1, 8>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   192,    64,   8,   8,  32,   32,    4,    3,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
@@ -62,12 +67,13 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_instances = std::tu
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Latency friendly 
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,   16,   128,   8,   8,  16,   16,    1,    1,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 64, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,   128,   8,   8,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
@@ -76,11 +82,19 @@ using device_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_mem_instances = std::tup
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,   64,   128,   8,   8,  16,   16,    1,    1,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
 
         // Memory friendly
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,   16,   256,   8,   8,  16,   16,    1,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 64, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,   256,   8,   8,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   256,   8,   8,  16,   16,    1,    1,     S<32, 2, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 2, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,   256,   8,   8,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,   64,   256,   8,   8,  16,   16,    1,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   8,   8,  16,   16,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   4,   4,  16,   16,    4,    1,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   2,   2,  16,   16,    4,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,    64,   8,   8,  16,   16,    4,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   16,    64,   8,   8,  16,   16,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   8,   8,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+#if !defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   8,   8,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   8,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+#endif // !defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   8,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
     // clang-format on
     >;
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp
index 889e96de50..e3f3afff46 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp
@@ -34,48 +34,65 @@ static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
 template <GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_comp_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-        
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+#if !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   256,    32,   8,   4,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,    64,    64,   8,   2,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,    64,    64,   8,   2,  32,   32,    1,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   192,    64,    64,   8,   2,  32,   32,    3,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   192,    64,    64,   8,   2,  32,   32,    3,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
-
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   4,  32,   32,    2,    2,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   2,   2,  32,   32,    2,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32,  8, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
     // clang-format on
     >;
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn_mem_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Latency friendly
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   4,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,   128,   8,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,   64,   128,   8,   4,  16,   16,    1,    1,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
 
         // Memory friendly
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   256,   8,   8,  16,   16,    1,    1,     S<32, 2, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,   256,   8,   8,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,   64,   256,   8,   8,  16,   16,    1,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   32,    64,   8,   2,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   32,    64,   2,   2,  32,   32,    2,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   8,   2,  16,   16,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   2,   2,  16,   16,    4,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   32,    64,   8,   4,  32,   32,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,    64,   8,   4,  16,   16,    4,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   32,    64,   8,   4,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   16,    64,   8,   4,  16,   16,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   4,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   8,   4,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   8,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   8,   4,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   64,    64,   8,   4,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   8,   4,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,  128,    64,   8,   4,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   8,   4,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   4,   4,  16,   16,    1,    4,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,    64,   8,   4,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
     // clang-format on
     >;
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn.hpp
index 22b3bb9ae0..e39c9a63b9 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn.hpp
@@ -34,12 +34,13 @@ static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
 template <GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-        
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Compute friendly
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
@@ -56,11 +57,13 @@ using device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_instances = std::tuple
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  16,   16,    8,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  16,   16,    8,    8,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+#if !defined(CK_USE_AMD_MFMA_GFX950)
         // AGPR Spill
         // DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         // AGPR Spill when use permuted lds layout. so, use padding for these two.
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    64,   8,   8,  16,   16,    8,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 64, 1, 4>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   256,    32,   8,   8,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
@@ -71,25 +74,46 @@ using device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_comp_instances = std::tuple
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_f16_f16_f16_mk_nk_mn_mem_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Latency friendly 
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,   16,   128,   8,   8,  16,   16,    1,    1,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 64, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,   128,   8,   8,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   8,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,   128,   8,   8,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,   64,   128,   8,   8,  16,   16,    1,    1,     S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
-
         // Memory friendly
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    64,   16,   256,   8,   8,  16,   16,    1,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 64, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,   256,   8,   8,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   256,   8,   8,  16,   16,    1,    1,     S<32, 2, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 2, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,   256,   8,   8,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,   64,   256,   8,   8,  16,   16,    1,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   32,    64,   8,   8,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   32,    64,   4,   4,  32,   32,    2,    1,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   32,    64,   2,   2,  32,   32,    2,    1,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   16,    64,   8,   8,  16,   16,    4,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   32,    64,   8,   8,  32,   32,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   32,    64,   4,   4,  32,   32,    2,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   32,    64,   2,   2,  32,   32,    2,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,   128,   16,    64,   8,   8,  16,   16,    4,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   32,    64,   8,   8,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    64,   16,    64,   8,   8,  16,   16,    2,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   8,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,    64,   8,   8,  16,   16,    1,    1,     S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8,  8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,    64,   2,   2,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+#if !defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   8,   8,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   8,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+#endif // !defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   64,    64,   8,   8,  32,   32,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,  128,    64,   8,   8,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   8,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,    64,   8,   8,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,    64,   4,   4,  32,   32,    1,    2,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
+        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,    64,   2,   2,  32,   32,    1,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
     // clang-format on
     >;
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn.hpp
index f2eb52b49a..43dc6be076 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn.hpp
@@ -35,18 +35,21 @@ static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
 template <GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-        
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         // Disable due to test failure
         // DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+#if !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   4,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   256,    32,   8,   4,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
@@ -55,12 +58,13 @@ using device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_comp_instances = std::tuple<
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_f16_f8_f16_mk_kn_mn_mem_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Latency friendly
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   256,   8,   4,  16,   16,    1,    1,     S<32, 2, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<64, 1, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F16,   F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   32,   256,   8,   4,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<64, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn.hpp
index 78d16670cb..9bdb2f51c2 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_f8_f16/device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn.hpp
@@ -35,15 +35,18 @@ static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
 template <GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-        
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Compute friendly
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
+#if !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,  16,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
@@ -52,12 +55,13 @@ using device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_comp_instances = std::tuple<
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_f16_f8_f16_mk_nk_mn_mem_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Latency friendly 
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,   128,   8,   16,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    F8,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   16,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 8, 1>,      S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp
index 8666cf8589..616133e1ba 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn.hpp
@@ -47,9 +47,11 @@ using device_gemm_xdl_universal_f8_f8_bf16_mk_kn_mn_comp_instances = std::tuple<
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,   128,  16,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32,  8, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,  16,   4,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,   128,  16,   8,  16,   16,    8,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+#if !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    64,  16,   4,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,   128,  16,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,   128,  16,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32,  8, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,   128,  16,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<32,  8, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   256,    64,  16,   4,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,             16,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Row,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   128,    64,  16,   4,  32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<16, 16, 1>,    S<0, 2, 1>,    S<0, 2, 1>,               1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1, F8>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp
index ab3225b2a7..6388c13444 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f8_f8_bf16/device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn.hpp
@@ -47,7 +47,9 @@ using device_gemm_xdl_universal_f8_f8_bf16_mk_nk_mn_comp_instances = std::tuple<
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,  16,  16,  32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,   128,  16,  16,  16,   16,    8,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    64,  16,  16,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+#if !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    128, 16,  16,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
+#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    128, 16,  16,  16,   16,    8,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          2,           1,                   S<1, 64, 1, 4>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   192,    128, 16,  16,  32,   32,    4,    3,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
         DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F8,     F8,    BF16,   F32,     BF16,  PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    128, 16,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,             16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3, F8>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp
index 21cef335c5..7fb690c8b2 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_batched/device_batched_gemm_xdl_universal_bf16_bf16_bf16/device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn.hpp
@@ -50,7 +50,9 @@ using device_batched_gemm_xdl_universal_bf16_bf16_bf16_mk_nk_mn_comp_instances =
         DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
+#if !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 16, 1, 16>,         S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    64,   8,   8,  16,   16,    8,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 32, 1, 8>,          S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   160,    64,   8,   8,  16,   16,    8,    5,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 32, 1, 8>,          S<4>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceBatchedGemmMultiD_Xdl_CShuffle_V3<  Row,     Col, DsLayout,     Row,     BF16,   BF16, DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   160,    64,   8,   8,  32,   32,    1,    5,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 64, 1, 4>,          S<8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp
index 83a898df61..68c6ce89ab 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_bf16_bf16/device_gemm_xdl_universal_bf16_bf16_bf16_mk_kn_mn.hpp
@@ -39,17 +39,20 @@ template <GemmSpecialization GemmSpec,
           typename DsLayout   = ck::Tuple<>,
           typename DsDataType = ck::Tuple<>>
 using device_gemm_xdl_universal_reduce_bf16_bf16_bf16_mk_kn_mn_comp_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout|  DsLayout| CLayout| AData| BData|      DsData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|                Block-wiseGemm|
         //#########################|        |        |          |        |  Type|  Type|        Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                      Pipeline|
         //#########################|        |        |          |        |      |      |            |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                      Verision|
         //#########################|        |        |          |        |      |      |            |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                              |
-        
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+#if !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  BF16,   BF16, DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   256,    32,   8,   4,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
@@ -63,12 +66,13 @@ template <BlockGemmPipelineScheduler BlkGemmPipeSched,
           typename DsLayout   = ck::Tuple<>,
           typename DsDataType = ck::Tuple<>>
 using device_gemm_xdl_universal_reduce_bf16_bf16_bf16_mk_kn_mn_mem_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout|  DsLayout| CLayout|  AData| BData|      DsData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|                Block-wiseGemm|
         //#########################|        |        |          |        |   Type|  Type|        Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                      Pipeline|
         //#########################|        |        |          |        |       |      |            |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                      Verision|
         //#########################|        |        |          |        |       |      |            |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                              |
-
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Latency friendly
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,   BF16,   BF16, DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   128,    32,    16,    64,   8,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,   BF16,   BF16, DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,    64,    16,    16,   128,   8,   4,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_i8_bf16/device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_i8_bf16/device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn.hpp
index cfdf5fb67c..4e1e5567d5 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_i8_bf16/device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_bf16_i8_bf16/device_gemm_xdl_universal_bf16_i8_bf16_mk_kn_mn.hpp
@@ -40,12 +40,15 @@ template <GemmSpecialization GemmSpec,
           typename DsLayout   = ck::Tuple<>,
           typename DsDataType = ck::Tuple<>>
 using device_gemm_xdl_universal_reduce_bf16_i8_bf16_mk_kn_mn_comp_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout| DsLayout| CLayout|AData| BData|    DsData|  CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                          Block-wiseGemm|                Block-wiseGemm|
         //#########################|        |        |         |        | Type|  Type|      Type|   Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                                Pipeline|                      Pipeline|
         //#########################|        |        |         |        |     |      |          |       |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                               Scheduler|                      Verision|
         //#########################|        |        |         |        |     |      |          |       |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                        |                              |
-        
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        //DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row, DsLayout,     Row, BF16,   I8,  DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,   128,  16,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
+        DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row, DsLayout,     Row, BF16,   I8,  DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   128,    32,   128,   128,  16,   4,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v2>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row, DsLayout,     Row, BF16,   I8,  DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row, DsLayout,     Row, BF16,   I8,  DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row, DsLayout,     Row, BF16,   I8,  DsDataType,  BF16,     F32,     BF16, PassThrough, PassThrough, PassThrough,      GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,               8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
@@ -62,12 +65,14 @@ template <BlockGemmPipelineScheduler BlkGemmPipeSched,
           typename DsLayout   = ck::Tuple<>,
           typename DsDataType = ck::Tuple<>>
 using device_gemm_xdl_universal_reduce_bf16_i8_bf16_mk_kn_mn_mem_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout| DsLayout| CLayout|AData| BData|     DsData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|                Block-wiseGemm|
         //#########################|        |        |         |        | Type|  Type|       Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                      Pipeline|
         //#########################|        |        |         |        |     |      |           |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                      Verision|
         //#########################|        |        |         |        |     |      |           |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                              |
-
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        //DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row, DsLayout,     Row, BF16,   I8,  DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   128,    32,  128,   128,  16,   4,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
+#else
         // Latency friendly
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row, DsLayout,     Row, BF16,   I8,  DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,    64,    16,   16,   256,   8,   4,  16,   16,    1,    1,     S<32, 2, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<64, 1, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row, DsLayout,     Row, BF16,   I8,  DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   128,    16,   32,   256,   8,   4,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<64, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
@@ -80,6 +85,7 @@ using device_gemm_xdl_universal_reduce_bf16_i8_bf16_mk_kn_mn_mem_instances = std
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row, DsLayout,     Row, BF16,   I8,  DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   128,    32,  128,    64,   8,   4,  32,   32,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row, DsLayout,     Row, BF16,   I8,  DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   256,    16,  256,    64,   8,   4,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row, DsLayout,     Row, BF16,   I8,  DsDataType,  BF16,   F32,     BF16,   PassThrough, PassThrough, PassThrough,      GemmSpec,   256,    32,  256,    64,   8,   4,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              16,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
     // clang-format on
     >;
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp
index a81b5c419e..928c325ab7 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_reduce/device_gemm_xdl_universal_f16_f16_f16/device_gemm_xdl_universal_f16_f16_f16_mk_kn_mn.hpp
@@ -39,17 +39,20 @@ template <GemmSpecialization GemmSpec,
           typename DsLayout   = ck::Tuple<>,
           typename DsDataType = ck::Tuple<>>
 using device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_comp_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout|  DsLayout| CLayout|AData| BData|     DsData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |          |        | Type|  Type|       Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //#########################|        |        |          |        |     |      |           |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //#########################|        |        |          |        |     |      |           |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-        
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  F16,   F16, DsDataType,   F16,   F32,     F16,    PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  F16,   F16, DsDataType,   F16,   F32,     F16,    PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  F16,   F16, DsDataType,   F16,   F32,     F16,    PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  F16,   F16, DsDataType,   F16,   F32,     F16,    PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+#if !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  F16,   F16, DsDataType,   F16,   F32,     F16,    PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  F16,   F16, DsDataType,   F16,   F32,     F16,    PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  F16,   F16, DsDataType,   F16,   F32,     F16,    PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,     Row,  F16,   F16, DsDataType,   F16,   F32,     F16,    PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   256,    32,   8,   4,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
@@ -63,12 +66,13 @@ template <BlockGemmPipelineScheduler BlkGemmPipeSched,
           typename DsLayout   = ck::Tuple<>,
           typename DsDataType = ck::Tuple<>>
 using device_gemm_xdl_universal_reduce_f16_f16_f16_mk_kn_mn_mem_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout| DsLayout| CLayout|AData| BData|    DsData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |         |        | Type|  Type|      Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //#########################|        |        |         |        |     |      |          |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //#########################|        |        |         |        |     |      |          |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Latency friendly
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,    Row,  F16,   F16, DsDataType,  F16,   F32,     F16,    PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffleV3R1<     Row,     Row,  DsLayout,    Row,  F16,   F16, DsDataType,  F16,   F32,     F16,    PassThrough, PassThrough, PassThrough,       GemmSpec,    64,    16,   16,   128,   8,   4,  16,   16,    1,    1,     S<16, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 4>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt
index 08746a52d7..e1612bcd24 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/CMakeLists.txt
@@ -64,6 +64,43 @@ list(APPEND GEMM_UNIVERSAL_STREAMK_INSTANCES
         device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v1_mnkpadding_instance.cpp
         device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_default_instance.cpp
         device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_kpadding_instance.cpp
-        device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp)
+        device_gemm_xdl_universal_streamk_f8_f16_f16/device_gemm_xdl_universal_streamk_f8_f16_f16_mk_nk_mn_mem_v2_mnkpadding_instance.cpp
+        
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_default_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_comp_mnpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_default_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_default_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v1_mnkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_kn_mn_mem_v2_mnkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_default_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_comp_mkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_default_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v1_mkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_default_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_km_nk_mn_mem_v2_mkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_default_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_mnpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_default_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v1_mnkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_default_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_mem_v2_mnkpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_default_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_default_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v1_kpadding_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_default_instance.cpp
+        device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_v2_kpadding_instance.cpp)
 
 add_instance_library(device_gemm_universal_streamk_instance ${GEMM_UNIVERSAL_STREAMK_INSTANCES})
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp
index 763ac4facf..2a02995827 100755
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn.hpp
@@ -48,7 +48,9 @@ using device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_kn_mn_comp_instances =
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+#if !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              8,          0,          1,           2,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp
index 7a59823d9a..425c2c0391 100755
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_bf16_bf16_bf16/device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn.hpp
@@ -51,11 +51,18 @@ using device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_comp_instances =
         // AGPR Spill
         // DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  16,   16,    8,    8,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          1,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         // AGPR Spill when use permuted lds layout. so, use padding for these two.
+#if !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+
+#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    64,   8,   8,  16,   16,    8,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 32, 1, 8>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               4,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
+       
+        // DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              8,          0,          1,           1,                   S<1, 16, 1, 16>,               2,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
+
     // clang-format on
     >;
 
@@ -85,6 +92,8 @@ using device_gemm_xdl_universal_streamk_bf16_bf16_bf16_mk_nk_mn_mem_instances =
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,   64,    64,   8,   8,  16,   16,    1,    2,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    16,  128,    64,   8,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   8,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
+
+        // DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     BF16,   BF16,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   8,   8,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              8,          0,          1,           1,                   S<1, 16, 1, 16>,              2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
     // clang-format on
     >;
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp
index 5460f7f857..aa193417d9 100755
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn.hpp
@@ -34,12 +34,14 @@ static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
 template <GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-        
+#if defined(__gfx950__)
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   32,   128,  16,   2,  32,   32,    2,    1,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<32, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               4,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v2>
+#else        
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   4,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  32,   32,    4,    4,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
@@ -52,20 +54,23 @@ using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_comp_instances = st
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   256,    32,   8,   4,  32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   128,    32,   8,   4,  32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
+        DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   4,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   4,   4,  32,   32,    2,    2,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 16, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   2,   2,  32,   32,    2,    2,     S<32, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32,  8, 1>,    S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
+#endif // defined(__gfx950__)
     // clang-format on
     >;
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-
-       // Latency friendly
+#if defined(__gfx950__)
+#else        
+        // Latency friendly
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   4,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 8, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   2,   2,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 4, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
@@ -95,6 +100,7 @@ using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_kn_mn_mem_instances = std
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   8,   4,  16,   16,    1,    4,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    16,  256,    64,   4,   4,  16,   16,    1,    4,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              4,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              4,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Row,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,    32,  256,    64,   8,   4,  32,   32,    1,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,          0,          1,           1,                   S<1, 16, 1, 16>,              8,  BlkGemmPipeSched, BlockGemmPipelineVersion::v2>
+#endif // defined(__gfx950__)
     // clang-format on
     >;
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp
index e716b3e85c..a685c4f252 100755
--- a/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal_streamk/device_gemm_xdl_universal_streamk_f16_f16_f16/device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn.hpp
@@ -34,12 +34,13 @@ static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
 template <GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-        
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Compute friendly
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   8,   8,  32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  32,   32,    4,    4,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
@@ -57,7 +58,9 @@ using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances = st
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   4,   4,  16,   16,    8,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   256,    32,   2,   2,  16,   16,    8,    8,     S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<16,16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
      
+#if !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,   8,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
+#endif // !defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   256,   224,    64,   8,   8,  16,   16,    8,    7,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          2,           1,                   S<1, 64, 1, 4>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,   8,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
@@ -69,13 +72,14 @@ using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_comp_instances = st
 
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_streamk_f16_f16_f16_mk_nk_mn_mem_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|    Block-wiseGemm|               Block-wiseGemm|
         //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|          Pipeline|                     Pipeline|
         //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|         Scheduler|                     Verision|
         //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                  |                             |
-
-       // Latency friendly 
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
+        // Latency friendly 
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   8,   8,  16,   16,    1,    1,     S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<8, 16, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   4,   4,  16,   16,    1,    1,     S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,    S<16, 8, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              4,              4,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         DeviceGemm_Xdl_CShuffle_Streamk_V3<  Row,     Col,     Row,     F16,   F16,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   128,    32,   16,    64,   2,   2,  16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,    S<32, 4, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              2,              2,          0,          1,           1,                   S<1, 16, 1, 8>,               2,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt
index 05eaf90677..ac393560b7 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/CMakeLists.txt
@@ -7,6 +7,25 @@ set(GROUPED_CONV2D_BWD_WEIGHT
     xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
     xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_f32_bf16_instance.cpp
     xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_default_pipev1_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_pad0_pipev1_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_default_pipev1_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_pad0_pipev1_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev2_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev5_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev2_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev5_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev2_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev5_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev2_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev5_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev2_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev5_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev2_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev5_instance.cpp
     xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_instance.cpp
     xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_instance.cpp
     xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev2_instance.cpp
@@ -19,6 +38,10 @@ set(GROUPED_CONV2D_BWD_WEIGHT
     xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_f16_pipev1_instance.cpp
     xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev1_instance.cpp
     xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_ngchw_gkyxc_ngkhw_bf16_pipev1_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev2_irregular_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_f16_pipev5_irregular_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev2_irregular_instance.cpp
+    xdl/device_grouped_conv2d_bwd_weight_two_stage_xdl_nhwgc_gkyxc_nhwgk_bf16_pipev5_irregular_instance.cpp
     )
 
 if(DL_KERNELS)
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp
index 69e22dee41..abff222a4e 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
@@ -25,7 +25,7 @@ void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_in
     // 1. Default
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances<
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_generic_instances<
             2,
             GNHWC,
             GKYXC,
@@ -34,7 +34,7 @@ void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_bf16_f32_bf16_in
     // 2. Filter1x1Stride1Pad0
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances<
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_generic_instances<
             2,
             GNHWC,
             GKYXC,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_default_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_default_pipev1_instance.cpp
new file mode 100644
index 0000000000..5282d0da4c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_default_pipev1_instance.cpp
@@ -0,0 +1,38 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[g, n, hi, wi, c] * wei[g, k, y, x, c] = out[g, n, ho, wo, k]
+void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_default_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           GNHWC,
+                                                           GKYXC,
+                                                           GNHWK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f16_instances<
+                                       2,
+                                       GNHWC,
+                                       GKYXC,
+                                       GNHWK,
+                                       ConvBwdWeightDefault,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v1>{});
+}
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
index abb90b7c71..57df822021 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
@@ -25,19 +25,20 @@ void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_instances(
     // 1. Default
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances<2,
-                                                                   GNHWC,
-                                                                   GKYXC,
-                                                                   GNHWK,
-                                                                   ConvBwdWeightDefault>{});
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_generic_instances<2,
+                                                                           GNHWC,
+                                                                           GKYXC,
+                                                                           GNHWK,
+                                                                           ConvBwdWeightDefault>{});
     // 2. Filter1x1Stride1Pad0
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances<
-                                       2,
-                                       GNHWC,
-                                       GKYXC,
-                                       GNHWK,
-                                       ConvBwdWeightFilter1x1Stride1Pad0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_generic_instances<
+            2,
+            GNHWC,
+            GKYXC,
+            GNHWK,
+            ConvBwdWeightFilter1x1Stride1Pad0>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_pad0_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_pad0_pipev1_instance.cpp
new file mode 100644
index 0000000000..715789747d
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_pad0_pipev1_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[g, n, hi, wi, c] * wei[g, k, y, x, c] = out[g, n, ho, wo, k]
+void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f16_pad0_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           GNHWC,
+                                                           GKYXC,
+                                                           GNHWK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f16_instances<
+                                       2,
+                                       GNHWC,
+                                       GKYXC,
+                                       GNHWK,
+                                       ConvBwdWeightFilter1x1Stride1Pad0,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v1>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_default_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_default_pipev1_instance.cpp
new file mode 100644
index 0000000000..b93a4d0d97
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_default_pipev1_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[g, n, hi, wi, c] * wei[g, k, y, x, c] = out[g, n, ho, wo, k]
+void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_default_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           GNHWC,
+                                                           GKYXC,
+                                                           GNHWK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f32_instances<
+                                       2,
+                                       GNHWC,
+                                       GKYXC,
+                                       GNHWK,
+                                       ConvBwdWeightDefault,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v1>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
index 2e0fef9cf8..c899f34d51 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
@@ -25,19 +25,20 @@ void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_instances(
     // 1. Default
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances<2,
-                                                                   GNHWC,
-                                                                   GKYXC,
-                                                                   GNHWK,
-                                                                   ConvBwdWeightDefault>{});
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_generic_instances<2,
+                                                                           GNHWC,
+                                                                           GKYXC,
+                                                                           GNHWK,
+                                                                           ConvBwdWeightDefault>{});
     // 2. Filter1x1Stride1Pad0
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances<
-                                       2,
-                                       GNHWC,
-                                       GKYXC,
-                                       GNHWK,
-                                       ConvBwdWeightFilter1x1Stride1Pad0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_generic_instances<
+            2,
+            GNHWC,
+            GKYXC,
+            GNHWK,
+            ConvBwdWeightFilter1x1Stride1Pad0>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_pad0_pipev1_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_pad0_pipev1_instance.cpp
new file mode 100644
index 0000000000..026ab88158
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_pad0_pipev1_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[g, n, hi, wi, c] * wei[g, k, y, x, c] = out[g, n, ho, wo, k]
+void add_device_grouped_conv2d_bwd_weight_xdl_gnhwc_gkyxc_gnhwk_f32_pad0_pipev1_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           GNHWC,
+                                                           GKYXC,
+                                                           GNHWK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f32_instances<
+                                       2,
+                                       GNHWC,
+                                       GKYXC,
+                                       GNHWK,
+                                       ConvBwdWeightFilter1x1Stride1Pad0,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v1>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp
new file mode 100644
index 0000000000..2c494f22ff
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NGCHW,
+                                                           GKYXC,
+                                                           NGKHW,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<2,
+                                                                    NGCHW,
+                                                                    GKYXC,
+                                                                    NGKHW,
+                                                                    ConvBwdWeightDefault,
+                                                                    1,
+                                                                    1>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<2,
+                                                                    NGCHW,
+                                                                    GKYXC,
+                                                                    NGKHW,
+                                                                    ConvBwdWeightDefault,
+                                                                    4,
+                                                                    4>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp
new file mode 100644
index 0000000000..911751b0b9
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NGCHW,
+                                                           GKYXC,
+                                                           NGKHW,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances<2,
+                                                                   NGCHW,
+                                                                   GKYXC,
+                                                                   NGKHW,
+                                                                   ConvBwdWeightDefault,
+                                                                   1,
+                                                                   1>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances<2,
+                                                                   NGCHW,
+                                                                   GKYXC,
+                                                                   NGKHW,
+                                                                   ConvBwdWeightDefault,
+                                                                   4,
+                                                                   4>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp
new file mode 100644
index 0000000000..3b1fdd9be7
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_xdl_ngchw_gkyxc_ngkhw_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NGCHW,
+                                                           GKYXC,
+                                                           NGKHW,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances<2,
+                                                                   NGCHW,
+                                                                   GKYXC,
+                                                                   NGKHW,
+                                                                   ConvBwdWeightDefault,
+                                                                   1,
+                                                                   1>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances<2,
+                                                                   NGCHW,
+                                                                   GKYXC,
+                                                                   NGKHW,
+                                                                   ConvBwdWeightDefault,
+                                                                   4,
+                                                                   4>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev2_instance.cpp
new file mode 100644
index 0000000000..9e717ea9c1
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev2_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_bf16_instances<
+                                       2,
+                                       NHWGC,
+                                       GKYXC,
+                                       NHWGK,
+                                       ConvBwdWeightDefault,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v2>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev5_instance.cpp
new file mode 100644
index 0000000000..2081bc545f
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev5_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_bf16_instances<
+                                       2,
+                                       NHWGC,
+                                       GKYXC,
+                                       NHWGK,
+                                       ConvBwdWeightDefault,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v5>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev2_instance.cpp
new file mode 100644
index 0000000000..42927a91e8
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev2_instance.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_bf16_instances<
+                                       2,
+                                       NHWGC,
+                                       GKYXC,
+                                       NHWGK,
+                                       ConvBwdWeightFilter1x1Stride1Pad0,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v2>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev5_instance.cpp
new file mode 100644
index 0000000000..15366961f3
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev5_instance.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_bf16_pad0_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_bf16_instances<
+                                       2,
+                                       NHWGC,
+                                       GKYXC,
+                                       NHWGK,
+                                       ConvBwdWeightFilter1x1Stride1Pad0,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v5>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev2_instance.cpp
new file mode 100644
index 0000000000..b38779de9d
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev2_instance.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f16_instances<
+                                       2,
+                                       NHWGC,
+                                       GKYXC,
+                                       NHWGK,
+                                       ConvBwdWeightDefault,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v2>{});
+    ;
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev5_instance.cpp
new file mode 100644
index 0000000000..18a6d326b9
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev5_instance.cpp
@@ -0,0 +1,40 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f16_instances<
+                                       2,
+                                       NHWGC,
+                                       GKYXC,
+                                       NHWGK,
+                                       ConvBwdWeightDefault,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v5>{});
+    ;
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev2_instance.cpp
new file mode 100644
index 0000000000..485eb85199
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev2_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f16_instances<
+                                       2,
+                                       NHWGC,
+                                       GKYXC,
+                                       NHWGK,
+                                       ConvBwdWeightFilter1x1Stride1Pad0,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v2>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev5_instance.cpp
new file mode 100644
index 0000000000..fae04b5718
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev5_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f16_pad0_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f16_instances<
+                                       2,
+                                       NHWGC,
+                                       GKYXC,
+                                       NHWGK,
+                                       ConvBwdWeightFilter1x1Stride1Pad0,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v5>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev2_instance.cpp
new file mode 100644
index 0000000000..ba82f403fb
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev2_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f32_instances<
+                                       2,
+                                       NHWGC,
+                                       GKYXC,
+                                       NHWGK,
+                                       ConvBwdWeightDefault,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v2>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev5_instance.cpp
new file mode 100644
index 0000000000..4e2d37576d
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev5_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f32_instances<
+                                       2,
+                                       NHWGC,
+                                       GKYXC,
+                                       NHWGK,
+                                       ConvBwdWeightDefault,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v5>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev2_instance.cpp
new file mode 100644
index 0000000000..f99c644618
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev2_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f32_instances<
+                                       2,
+                                       NHWGC,
+                                       GKYXC,
+                                       NHWGK,
+                                       ConvBwdWeightFilter1x1Stride1Pad0,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v2>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev5_instance.cpp
new file mode 100644
index 0000000000..3be56da25b
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_weight/xdl/device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev5_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv2d_bwd_weight_xdl_nhwgc_gkyxc_nhwgk_f32_pad0_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<2,
+                                                           NHWGC,
+                                                           GKYXC,
+                                                           NHWGK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f32_instances<
+                                       2,
+                                       NHWGC,
+                                       GKYXC,
+                                       NHWGK,
+                                       ConvBwdWeightFilter1x1Stride1Pad0,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v5>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt
index 98bee66a91..146916cfd3 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv2d_fwd/CMakeLists.txt
@@ -11,6 +11,7 @@ add_instance_library(device_grouped_conv2d_fwd_instance
    xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_instance.cpp
    xdl/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_instance.cpp
    # NGCHW, GKYXC, NGKHW
+   xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp
    xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp
    xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp
    xdl/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_instance.cpp
@@ -27,6 +28,7 @@ add_instance_library(device_grouped_conv2d_fwd_instance
    xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_f32_instance.cpp
    xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_nhwgc_gkyxc_nhwgk_int8_instance.cpp
    # NGCHW, GKYXC, NGKHW
+   xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_bf16_instance.cpp
    xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f16_instance.cpp
    xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_f32_instance.cpp
    xdl/merged_groups/device_grouped_conv2d_fwd_xdl_merged_groups_ngchw_gkyxc_ngkhw_int8_instance.cpp
@@ -42,10 +44,12 @@ add_instance_library(device_grouped_conv2d_fwd_instance
    xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_mem_inter_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_mem_inter_instance.cpp
    # NGCHW, GKYXC, NGKHW
+   xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_intra_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_intra_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_intra_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_intra_instance.cpp
    # NGCHW, GKYXC, NGKHW
+   xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_mem_inter_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_mem_inter_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_mem_inter_instance.cpp
    xdl/mem/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_mem_inter_instance.cpp
@@ -56,6 +60,7 @@ add_instance_library(device_grouped_conv2d_fwd_instance
    xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_f32_comp_instance.cpp
    xdl/comp/device_grouped_conv2d_fwd_xdl_nhwgc_gkyxc_nhwgk_int8_comp_instance.cpp
    # NGCHW, GKYXC, NGKHW
+   xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_bf16_comp_instance.cpp
    xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f16_comp_instance.cpp
    xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_f32_comp_instance.cpp
    xdl/comp/device_grouped_conv2d_fwd_xdl_ngchw_gkyxc_ngkhw_int8_comp_instance.cpp
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt
index cf4e323bfe..860e08cafe 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/CMakeLists.txt
@@ -7,18 +7,37 @@ set(GROUPED_CONV3D_BWD_WEIGHT
      xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_instance.cpp
      xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_f32_bf16_instance.cpp
      xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_instance.cpp
-    xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp
-    xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp
-    xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instance.cpp
-    xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev5_instance.cpp
-    xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp
-    xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp
-    xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instance.cpp
-    xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instance.cpp
-    xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp
-    xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instance.cpp
-    xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp
-    xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev2_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev5_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev2_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev5_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev2_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev5_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev2_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev5_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev2_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev5_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev2_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev5_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev2_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev5_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev2_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev5_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev1_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_f16_pipev1_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev1_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_pipev1_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev2_irregular_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pipev5_irregular_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev2_irregular_instance.cpp
+     xdl/device_grouped_conv3d_bwd_weight_two_stage_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pipev5_irregular_instance.cpp
     )
 
 if(DL_KERNELS)
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp
index 81d64344f7..752f8a8157 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
@@ -24,7 +24,7 @@ void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16
     // 1. Default
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances<
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_generic_instances<
             3,
             GNDHWC,
             GKZYXC,
@@ -33,7 +33,7 @@ void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_bf16_f32_bf16
     // 2. Filter1x1Stride1Pad0
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_instances<
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_f32_bf16_generic_instances<
             3,
             GNDHWC,
             GKZYXC,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
index d03f0a7bac..4137fcb536 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
@@ -24,19 +24,20 @@ void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f16_instances
     // 1. Default
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances<3,
-                                                                   GNDHWC,
-                                                                   GKZYXC,
-                                                                   GNDHWK,
-                                                                   ConvBwdWeightDefault>{});
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_generic_instances<3,
+                                                                           GNDHWC,
+                                                                           GKZYXC,
+                                                                           GNDHWK,
+                                                                           ConvBwdWeightDefault>{});
     // 2. Filter1x1Stride1Pad0
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances<
-                                       3,
-                                       GNDHWC,
-                                       GKZYXC,
-                                       GNDHWK,
-                                       ConvBwdWeightFilter1x1Stride1Pad0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_generic_instances<
+            3,
+            GNDHWC,
+            GKZYXC,
+            GNDHWK,
+            ConvBwdWeightFilter1x1Stride1Pad0>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
index 7c24cc8fd4..d838a4e821 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instance.cpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
@@ -24,19 +24,20 @@ void add_device_grouped_conv3d_bwd_weight_xdl_gndhwc_gkzyxc_gndhwk_f32_instances
     // 1. Default
     add_device_operation_instances(
         instances,
-        device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances<3,
-                                                                   GNDHWC,
-                                                                   GKZYXC,
-                                                                   GNDHWK,
-                                                                   ConvBwdWeightDefault>{});
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_generic_instances<3,
+                                                                           GNDHWC,
+                                                                           GKZYXC,
+                                                                           GNDHWK,
+                                                                           ConvBwdWeightDefault>{});
     // 2. Filter1x1Stride1Pad0
-    add_device_operation_instances(instances,
-                                   device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances<
-                                       3,
-                                       GNDHWC,
-                                       GKZYXC,
-                                       GNDHWK,
-                                       ConvBwdWeightFilter1x1Stride1Pad0>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_generic_instances<
+            3,
+            GNDHWC,
+            GKZYXC,
+            GNDHWK,
+            ConvBwdWeightFilter1x1Stride1Pad0>{});
 }
 
 } // namespace instance
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev2_instance.cpp
new file mode 100644
index 0000000000..553d3bfc36
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev2_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_bf16_instances<
+                                       3,
+                                       NDHWGC,
+                                       GKZYXC,
+                                       NDHWGK,
+                                       ConvBwdWeightDefault,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v2>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev5_instance.cpp
new file mode 100644
index 0000000000..3cadf1d99c
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev5_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_bf16_instances<
+                                       3,
+                                       NDHWGC,
+                                       GKZYXC,
+                                       NDHWGK,
+                                       ConvBwdWeightDefault,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v5>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev2_instance.cpp
new file mode 100644
index 0000000000..71e35c86a9
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev2_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_bf16_instances<
+                                       3,
+                                       NDHWGC,
+                                       GKZYXC,
+                                       NDHWGK,
+                                       ConvBwdWeightFilter1x1Stride1Pad0,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v2>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev5_instance.cpp
new file mode 100644
index 0000000000..78a87d060e
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev5_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_bf16_pad0_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_bf16_instances<
+                                       3,
+                                       NDHWGC,
+                                       GKZYXC,
+                                       NDHWGK,
+                                       ConvBwdWeightFilter1x1Stride1Pad0,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v5>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev2_instance.cpp
new file mode 100644
index 0000000000..18d6871201
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev2_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f16_instances<
+                                       3,
+                                       NDHWGC,
+                                       GKZYXC,
+                                       NDHWGK,
+                                       ConvBwdWeightDefault,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v2>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev5_instance.cpp
new file mode 100644
index 0000000000..14d01f0702
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev5_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f16_instances<
+                                       3,
+                                       NDHWGC,
+                                       GKZYXC,
+                                       NDHWGK,
+                                       ConvBwdWeightDefault,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v5>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev2_instance.cpp
new file mode 100644
index 0000000000..ab0f977258
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev2_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f16_instances<
+                                       3,
+                                       NDHWGC,
+                                       GKZYXC,
+                                       NDHWGK,
+                                       ConvBwdWeightFilter1x1Stride1Pad0,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v2>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev5_instance.cpp
new file mode 100644
index 0000000000..bdc2248ed0
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev5_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f16_pad0_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f16_instances<
+                                       3,
+                                       NDHWGC,
+                                       GKZYXC,
+                                       NDHWGK,
+                                       ConvBwdWeightFilter1x1Stride1Pad0,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v5>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev2_instance.cpp
new file mode 100644
index 0000000000..ea3b9264b9
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev2_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f32_instances<
+                                       3,
+                                       NDHWGC,
+                                       GKZYXC,
+                                       NDHWGK,
+                                       ConvBwdWeightDefault,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v2>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev5_instance.cpp
new file mode 100644
index 0000000000..1bcde328b0
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev5_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_default_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f32_instances<
+                                       3,
+                                       NDHWGC,
+                                       GKZYXC,
+                                       NDHWGK,
+                                       ConvBwdWeightDefault,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v5>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev2_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev2_instance.cpp
new file mode 100644
index 0000000000..6c4c53e5f3
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev2_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev2_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f32_instances<
+                                       3,
+                                       NDHWGC,
+                                       GKZYXC,
+                                       NDHWGK,
+                                       ConvBwdWeightFilter1x1Stride1Pad0,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v2>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev5_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev5_instance.cpp
new file mode 100644
index 0000000000..97cc06cbad
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev5_instance.cpp
@@ -0,0 +1,39 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_v3_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_xdl_ndhwgc_gkzyxc_ndhwgk_f32_pad0_pipev5_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NDHWGC,
+                                                           GKZYXC,
+                                                           NDHWGK,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    add_device_operation_instances(instances,
+                                   device_grouped_conv_bwd_weight_v3_xdl_c_shuffle_f32_instances<
+                                       3,
+                                       NDHWGC,
+                                       GKZYXC,
+                                       NDHWGK,
+                                       ConvBwdWeightFilter1x1Stride1Pad0,
+                                       BlockGemmPipelineScheduler::Intrawave,
+                                       BlockGemmPipelineVersion::v5>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instance.cpp
new file mode 100644
index 0000000000..fd558f70ca
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instance.cpp
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NGCDHW,
+                                                           GKZYXC,
+                                                           NGKDHW,
+                                                           BF16,
+                                                           BF16,
+                                                           BF16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<3,
+                                                                    NGCDHW,
+                                                                    GKZYXC,
+                                                                    NGKDHW,
+                                                                    ConvBwdWeightDefault,
+                                                                    1,
+                                                                    1>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_bf16_instances<3,
+                                                                    NGCDHW,
+                                                                    GKZYXC,
+                                                                    NGKDHW,
+                                                                    ConvBwdWeightDefault,
+                                                                    4,
+                                                                    4>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instance.cpp
new file mode 100644
index 0000000000..945e6fa563
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instance.cpp
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NGCDHW,
+                                                           GKZYXC,
+                                                           NGKDHW,
+                                                           F16,
+                                                           F16,
+                                                           F16,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances<3,
+                                                                   NGCDHW,
+                                                                   GKZYXC,
+                                                                   NGKDHW,
+                                                                   ConvBwdWeightDefault,
+                                                                   1,
+                                                                   1>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f16_instances<3,
+                                                                   NGCDHW,
+                                                                   GKZYXC,
+                                                                   NGKDHW,
+                                                                   ConvBwdWeightDefault,
+                                                                   4,
+                                                                   4>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp
new file mode 100644
index 0000000000..b7e6454062
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_weight/xdl/device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp
@@ -0,0 +1,49 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/library/tensor_operation_instance/gpu/grouped_conv_bwd_weight/device_grouped_conv_bwd_weight_xdl_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// Compilation parameters for in[n, hi, wi, g, c] * wei[g, k, y, x, c] = out[n, ho, wo, g, k]
+void add_device_grouped_conv3d_bwd_weight_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instances(
+    std::vector<std::unique_ptr<DeviceGroupedConvBwdWeight<3,
+                                                           NGCDHW,
+                                                           GKZYXC,
+                                                           NGKDHW,
+                                                           F32,
+                                                           F32,
+                                                           F32,
+                                                           PassThrough,
+                                                           PassThrough,
+                                                           PassThrough>>>& instances)
+{
+    // 1. Default
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances<3,
+                                                                   NGCDHW,
+                                                                   GKZYXC,
+                                                                   NGKDHW,
+                                                                   ConvBwdWeightDefault,
+                                                                   1,
+                                                                   1>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_bwd_weight_xdl_c_shuffle_f32_instances<3,
+                                                                   NGCDHW,
+                                                                   GKZYXC,
+                                                                   NGKDHW,
+                                                                   ConvBwdWeightDefault,
+                                                                   4,
+                                                                   4>{});
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f8_f16_mk_kn_mn_irregular_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f8_f16_mk_kn_mn_irregular_instance.cpp
index f1c1cfc391..0bd53706be 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f8_f16_mk_kn_mn_irregular_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm/device_grouped_gemm_xdl_splitk_f16_f8_f16_mk_kn_mn_irregular_instance.cpp
@@ -29,13 +29,14 @@ using Empty_Tuple = ck::Tuple<>;
 
 using PassThrough                    = ck::tensor_operation::element_wise::PassThrough;
 static constexpr auto GemmMNKPadding = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
-
 using device_grouped_gemm_xdl_splitk_f16_f8_f16_mk_kn_mn_irregular_tile_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //################################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|           C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
         //################################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //################################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|   Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //################################|       |       |            |       |      |      |        |         |            |      |            |            |            |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |                |
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,    F8,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   256,   128,    32,   8,   8,   32,   32,    4,    2,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,    F8,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   128,   256,    32,   8,   8,   32,   32,    2,    4,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 64, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              4,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1>,
         DeviceGroupedGemmXdlSplitKCShuffle<    Row,    Row, Empty_Tuple,    Row,   F16,    F8,     F32,      F16, Empty_Tuple,   F16, PassThrough, PassThrough, PassThrough, GemmMNKPadding,        1,   256,   192,    64,    32,   8,   8,   32,   32,    3,    1,  S<1, 4, 64, 1>,  S<0, 2, 1, 3>,  S<0, 2, 1, 3>,              3,              8,              8,         1,  S<1, 4, 32, 1>,  S<0, 1, 3, 2>,  S<0, 1, 3, 2>,             2,              2,              8,         1,           1,           1,               S<1, 32, 1, 8>,              8, PipelineVersion::v1>,
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/CMakeLists.txt
index 1ee3d0add4..bc9c711d3a 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm_fixed_nk/CMakeLists.txt
@@ -8,6 +8,8 @@ list(APPEND GROUPED_GEMM_FIXED_NK_INSTANCES device_grouped_gemm_xdl_fixed_nk_f16
                                             device_grouped_gemm_xdl_fixed_nk_f16_i8_f16_mk_kn_mn_instance.cpp
                                             device_grouped_gemm_xdl_fixed_nk_f16_i8_f16_mk_nk_mn_instance.cpp
                                             device_grouped_gemm_xdl_fixed_nk_bf16_i8_bf16_mk_kn_mn_instance.cpp
-                                            device_grouped_gemm_xdl_fixed_nk_bf16_i8_bf16_mk_nk_mn_instance.cpp)
+                                            device_grouped_gemm_xdl_fixed_nk_bf16_i8_bf16_mk_nk_mn_instance.cpp
+                                            device_grouped_gemm_xdl_fixed_nk_bf16_bf16_bf16_mk_kn_mn_instance.cpp
+                                            device_grouped_gemm_xdl_fixed_nk_bf16_bf16_bf16_mk_nk_mn_instance.cpp)
 
-add_instance_library(device_grouped_gemm_fixed_nk_instance ${GROUPED_GEMM_FIXED_NK_INSTANCES})
+add_instance_library(device_grouped_gemm_fixed_nk_instance ${GROUPED_GEMM_FIXED_NK_INSTANCES})
\ No newline at end of file
diff --git a/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn.hpp b/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn.hpp
index d943376a37..baf04cf12e 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_gemm_tile_loop/device_grouped_gemm_xdl_tile_loop_multiply_bf16_i8_bf16_mk_kn_mn.hpp
@@ -44,11 +44,14 @@ template <typename DsLayout,
           typename CDEElementwiseOp,
           GemmSpecialization GemmSpec = GemmMNKPadding>
 using device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_comp_instances = std::tuple<
-    // clang-format off
+// clang-format off
         //###########################################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|                C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
         //###########################################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise|      Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //###########################################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //###########################################|       |       |            |       |      |      |        |         |            |      |            |            |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |   S<C,D0...,D_N| 
+#if defined(CK_USE_AMD_MFMA_GFX950)
+        DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   256,   128,   128,    128,  16,   4,   32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,               S<1, 32, 1, 8>,        S<8,8,8>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   256,   256,   256,    32,   8,   4,   32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,               S<1, 32, 1, 8>,        S<8,8,1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   256,   128,   128,    64,   8,   4,   32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,    S<16, 16, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,               S<1, 32, 1, 8>,        S<8,8,1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
         // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   256,   256,   256,    32,   8,   4,   32,   32,    4,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<8, 32, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,              8,              4,         0,           1,           1,               S<1, 32, 1, 8>,        S<8,8,1>,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
@@ -67,11 +70,13 @@ template <typename DsLayout,
           BlockGemmPipelineScheduler BlkGemmPipeSched = BlockGemmPipelineScheduler::Intrawave>
 using device_grouped_gemm_xdl_tile_loop_bf16_i8_bf16_mk_kn_mn_mem_instances =
     std::tuple<
-        // clang-format off
+// clang-format off
         //###########################################|      A|      B|          Ds|      E| AData| BData| AccData| CShuffle|      DsData| EData|           A|           B|                C|           GEMM| NumGemmK| Block|  MPer|  NPer|  KPer| AK1| BK1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
         //###########################################| Layout| Layout|      Layout| Layout|  Type|  Type|    Type| DataType|        Type|  Type| Elementwise| Elementwise|      Elementwise| Spacialization| Prefetch|  Size| Block| Block| Block|    |    |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|
         //###########################################|       |       |            |       |      |      |        |         |            |      |   Operation|   Operation|        Operation|               |    Stage|      |      |      |      |    |    |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|
         //###########################################|       |       |            |       |      |      |        |         |            |      |            |            |                 |               |         |      |      |      |      |    |    |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                             |   S<C,D0...,D_N| 
+#if defined(CK_USE_AMD_MFMA_GFX950)
+#endif // defined(CK_USE_AMD_MFMA_GFX950)
         // Latency friendly
         // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,    64,    16,    16,   256,   8,   4,   16,   16,    1,    1,     S<32, 2, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<64, 1, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,             16,              4,          0,          1,           1,               S<1, 16, 1, 4>,        S<4,4,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
         // DeviceGroupedGemmMultipleDXdlCShuffleTileLoop<   Row,     Row,    DsLayout,    Row,  BF16,    I8,     F32,      F32,  DsDataType,  BF16, PassThrough, PassThrough, CDEElementwiseOp,       GemmSpec,        1,   128,    16,    32,   256,   8,   4,   16,   16,    1,    1,     S<32, 4, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,         0,     S<64, 2, 1>,     S<0, 2, 1>,     S<0, 2, 1>,             1,             16,              4,          0,          1,           1,               S<1, 16, 1, 8>,        S<4,4,1>,  BlkGemmPipeSched, BlockGemmPipelineVersion::v1>,
diff --git a/library/src/utility/CMakeLists.txt b/library/src/utility/CMakeLists.txt
index 296e6c993a..28883efef0 100644
--- a/library/src/utility/CMakeLists.txt
+++ b/library/src/utility/CMakeLists.txt
@@ -1,4 +1,4 @@
-add_library(utility STATIC
+add_library(utility
     device_memory.cpp
     host_tensor.cpp
     convolution_parameter.cpp
@@ -6,6 +6,10 @@ add_library(utility STATIC
 
 add_library(composable_kernel::utility ALIAS utility)
 set_target_properties(utility PROPERTIES POSITION_INDEPENDENT_CODE ON)
+set_target_properties(utility
+    PROPERTIES
+    VERSION ${CMAKE_PROJECT_VERSION}
+    SOVERSION ${CMAKE_PROJECT_VERSION_MAJOR})
 target_compile_options(utility PRIVATE ${CMAKE_COMPILER_WARNINGS})
 target_include_directories(utility PUBLIC
     "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}/ck>"
diff --git a/profiler/include/profiler/profile_gemm_multiply_multiply_impl.hpp b/profiler/include/profiler/profile_gemm_multiply_multiply_impl.hpp
index 084274e11f..dbfddeb8a4 100644
--- a/profiler/include/profiler/profile_gemm_multiply_multiply_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_multiply_multiply_impl.hpp
@@ -189,7 +189,6 @@ bool profile_gemm_multiply_multiply_impl(int do_verification,
     for(auto& op_ptr : op_ptrs)
     {
         // Seems like when performance measurement has bug when spiltK is large
-        // std::vector<int> kbatch_list = {1, 2, 4, 8, 16, 19, 32, 38};
         std::vector<int> kbatch_list = {1, 2, 4, 8, 16};
 
         if(KBatch > 0)
diff --git a/profiler/include/profiler/profile_gemm_universal_impl.hpp b/profiler/include/profiler/profile_gemm_universal_impl.hpp
index 30f0da212d..2054ffbbb3 100644
--- a/profiler/include/profiler/profile_gemm_universal_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_universal_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2023-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -65,11 +65,13 @@ bool profile_gemm_universal_impl(int do_verification,
 
     Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
     Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
+    Tensor<BDataType> b_k_n_permute(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
     Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
     Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
 
-    int total_gemm_needed = a_m_k.GetElementSpaceSizeInBytes() + b_k_n.GetElementSpaceSizeInBytes();
-    int rotating_count    = std::max(
+    std::size_t total_gemm_needed =
+        a_m_k.GetElementSpaceSizeInBytes() + b_k_n.GetElementSpaceSizeInBytes();
+    int rotating_count = std::max(
         1,
         std::min(n_iter,
                  static_cast<int>(std::ceil(static_cast<double>(rotating) / total_gemm_needed))));
@@ -86,9 +88,13 @@ bool profile_gemm_universal_impl(int do_verification,
         a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-1, 2});
         b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-1, 2});
         break;
-    default:
+    case 2:
         a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
         b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        break;
+    default:
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-2, 2});
     }
 
     using AElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -100,11 +106,10 @@ bool profile_gemm_universal_impl(int do_verification,
     const auto c_element_op = CElementOp{};
 
     DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpaceSize());
-    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpaceSize());
+    DeviceMem b_device_buf(sizeof(BDataType) * b_k_n_permute.mDesc.GetElementSpaceSize());
     DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpaceSize());
 
     a_device_buf.ToDevice(a_m_k.mData.data());
-    b_device_buf.ToDevice(b_k_n.mData.data());
 
     using DeviceOp = ck::tensor_operation::device::DeviceGemmV2<ALayout,
                                                                 BLayout,
@@ -153,6 +158,84 @@ bool profile_gemm_universal_impl(int do_verification,
     // profile device GEMM instances
     for(auto& op_ptr : op_ptrs)
     {
+        const int KPerBlock = op_ptr->GetKPerBlock();
+
+        if(op_ptr->GetPermuteB())
+        {
+            int K1 = KPerBlock;
+            int K0 = K / KPerBlock;
+
+            // int K0, N, K1
+            for(int j = 0; j < K0; j++)
+            {
+                for(int i = 0; i < N; i++)
+                {
+                    for(int jj = 0; jj < K1; jj++)
+                    {
+                        b_k_n_permute(j * N * K1 + i * K1 + jj) = b_k_n(i * K + (j * K1 + jj));
+                    }
+                }
+            }
+
+            if constexpr(is_same_v<BDataType, pk_i4_t> && is_same_v<ADataType, half_t>)
+            {
+                // vector pk_i4x4 permute
+                for(int i = 0; i < N; i++)
+                {
+                    for(int j = 0; j < K; j += 8)
+                    {
+                        int input[8];
+
+                        for(int k = 0; k < 4; k++)
+                        {
+                            int i4x2         = b_k_n_permute(j + k * 2, i).data;
+                            input[k * 2 + 0] = (i4x2 >> 4) & 0xf;
+                            input[k * 2 + 1] = (i4x2 >> 0) & 0xf;
+                        }
+
+                        // permute 01234567->20643175
+                        {
+                            int hi   = input[2];
+                            int lo   = input[0];
+                            int i4x2 = (hi << 4) | lo;
+
+                            b_k_n_permute(j + 0, i) = i4x2;
+                        }
+
+                        {
+                            int hi   = input[6];
+                            int lo   = input[4];
+                            int i4x2 = (hi << 4) | lo;
+
+                            b_k_n_permute(j + 2, i) = i4x2;
+                        }
+
+                        {
+                            int hi   = input[3];
+                            int lo   = input[1];
+                            int i4x2 = (hi << 4) | lo;
+
+                            b_k_n_permute(j + 4, i) = i4x2;
+                        }
+
+                        {
+                            int hi   = input[7];
+                            int lo   = input[5];
+                            int i4x2 = (hi << 4) | lo;
+
+                            b_k_n_permute(j + 6, i) = i4x2;
+                        }
+                    }
+                }
+            }
+        }
+        else
+        {
+            b_k_n_permute = b_k_n;
+        }
+
+        b_device_buf.ToDevice(b_k_n_permute.mData.data());
+
         std::vector<int> kbatch_list = {1, 2, 4, 8, 16, 19, 32, 38};
 
         if(KBatch > 0)
@@ -240,7 +323,15 @@ bool profile_gemm_universal_impl(int do_verification,
 
                 std::size_t flop = std::size_t(2) * M * N * K;
 
-                std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+                static constexpr index_t BPackedSize = []() {
+                    if constexpr(is_same_v<remove_cvref_t<BDataType>, pk_i4_t>)
+                        return 2;
+                    else
+                        return 1;
+                }();
+
+                std::size_t num_btype = sizeof(ADataType) * M * K +
+                                        sizeof(BDataType) * K * N / BPackedSize +
                                         sizeof(CDataType) * M * N;
 
                 float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
index 3758af2477..a13f79182e 100644
--- a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
@@ -1,5 +1,5 @@
 // SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
+// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
 
 #pragma once
 
@@ -89,6 +89,7 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
     in_device_buf.ToDevice(input.mData.data());
     out_device_buf.ToDevice(output.mData.data());
 
+    float max_accumulated_value = 0;
     if(do_verification)
     {
         auto ref_conv     = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
@@ -114,6 +115,8 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
                                                   {});
 
         ref_invoker.Run(ref_argument);
+        max_accumulated_value =
+            *std::max_element(weight_host_result.mData.begin(), weight_host_result.mData.end());
     }
 
     using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdWeight<NDimSpatial,
@@ -237,7 +240,39 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
                 {
                     wei_device_buf.FromDevice(weight_device_result.mData.data());
 
-                    bool pass = ck::utils::check_err(weight_device_result, weight_host_result);
+                    using ComputeType =
+                        std::conditional_t<sizeof(ComputeTypeA) < sizeof(ComputeTypeB),
+                                           ComputeTypeA,
+                                           ComputeTypeB>;
+                    using AccDataType =
+                        std::conditional_t<std::is_same_v<ComputeType, int8_t>, int32_t, float>;
+                    const index_t num_accums         = output.GetElementSize() / conv_param.K_;
+                    const index_t num_accums_split_k = split_k_list[split_k_id];
+                    // Calculate thresholds
+                    auto rtol =
+                        ck::utils::get_relative_threshold<ComputeType, WeiDataType, AccDataType>(
+                            num_accums / num_accums_split_k);
+                    auto atol =
+                        ck::utils::get_absolute_threshold<ComputeType, WeiDataType, AccDataType>(
+                            max_accumulated_value / num_accums_split_k,
+                            num_accums / num_accums_split_k);
+                    // Calculate error due to split_k accumulation
+                    auto rtol_split_k =
+                        ck::utils::get_relative_threshold<WeiDataType, WeiDataType, WeiDataType>(
+                            num_accums_split_k);
+                    auto atol_split_k =
+                        ck::utils::get_absolute_threshold<WeiDataType, WeiDataType, WeiDataType>(
+                            max_accumulated_value, num_accums_split_k);
+                    // Use higher threshold
+                    rtol      = std::max(rtol, rtol_split_k);
+                    atol      = std::max(atol, atol_split_k);
+                    bool pass = ck::utils::check_err(weight_device_result,
+                                                     weight_host_result,
+                                                     "Error: Incorrect results!",
+                                                     rtol,
+                                                     atol);
+                    std::cout << "Relative error threshold: " << rtol
+                              << " Absolute error threshold: " << atol << std::endl;
 
                     if(!pass)
                     {
diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt
index c6ac18fce5..5ed28b9826 100644
--- a/profiler/src/CMakeLists.txt
+++ b/profiler/src/CMakeLists.txt
@@ -1,88 +1,90 @@
 # ckProfiler
 set(PROFILER_SOURCES
     profiler.cpp
-    # profile_gemm.cpp
-    # profile_reduce.cpp
-    # profile_groupnorm_bwd_data.cpp
-    # profile_groupnorm_fwd.cpp
-    # profile_layernorm_bwd_data.cpp
-    # profile_layernorm_bwd_gamma_beta.cpp
-    # profile_groupnorm_bwd_gamma_beta.cpp
-    # profile_layernorm_fwd.cpp
-    # profile_max_pool2d_fwd.cpp
-    # profile_pool3d_fwd.cpp
-    # profile_avg_pool3d_bwd.cpp
-    # profile_max_pool3d_bwd.cpp
-    # profile_avg_pool2d_bwd.cpp
-    # profile_max_pool2d_bwd.cpp
-    # profile_softmax.cpp
-    # profile_batchnorm_fwd.cpp
-    # profile_batchnorm_bwd.cpp
-    # profile_batchnorm_infer.cpp
-    # profile_conv_tensor_rearrange.cpp
-    # profile_transpose.cpp
-    # profile_permute_scale.cpp
+    profile_gemm.cpp
+    profile_reduce.cpp
+    profile_groupnorm_bwd_data.cpp
+    profile_groupnorm_fwd.cpp
+    profile_layernorm_bwd_data.cpp
+    profile_layernorm_bwd_gamma_beta.cpp
+    profile_groupnorm_bwd_gamma_beta.cpp
+    profile_layernorm_fwd.cpp
+    profile_max_pool2d_fwd.cpp
+    profile_pool3d_fwd.cpp
+    profile_avg_pool3d_bwd.cpp
+    profile_max_pool3d_bwd.cpp
+    profile_avg_pool2d_bwd.cpp
+    profile_max_pool2d_bwd.cpp
+    profile_softmax.cpp
+    profile_batchnorm_fwd.cpp
+    profile_batchnorm_bwd.cpp
+    profile_batchnorm_infer.cpp
+    profile_conv_tensor_rearrange.cpp
+    profile_transpose.cpp
+    profile_permute_scale.cpp
 )
 
 if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
-  # if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
-  #   list(APPEND PROFILER_SOURCES profile_contraction_bilinear.cpp)
-  #   list(APPEND PROFILER_SOURCES profile_contraction_scale.cpp)
-  # endif()
-  # if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-  #   list(APPEND PROFILER_SOURCES profile_gemm_reduce.cpp)
-  #   list(APPEND PROFILER_SOURCES profile_batched_gemm_gemm.cpp)
-  #   list(APPEND PROFILER_SOURCES profile_batched_gemm_add_relu_gemm_add.cpp)
-  #   list(APPEND PROFILER_SOURCES profile_gemm_add.cpp)
-  #   list(APPEND PROFILER_SOURCES profile_gemm_add_add_fastgelu.cpp)
-  #   list(APPEND PROFILER_SOURCES profile_gemm_add_fastgelu.cpp)
-  #   list(APPEND PROFILER_SOURCES profile_grouped_gemm.cpp)
-  #   list(APPEND PROFILER_SOURCES profile_gemm_streamk.cpp)
-  #   list(APPEND PROFILER_SOURCES profile_gemm_fastgelu.cpp)
-  #   list(APPEND PROFILER_SOURCES profile_gemm_add_relu.cpp)
-  #   list(APPEND PROFILER_SOURCES profile_gemm_add_silu.cpp)
-  #   list(APPEND PROFILER_SOURCES profile_gemm_add_relu_add_layernorm.cpp)
-  #   list(APPEND PROFILER_SOURCES profile_grouped_gemm_fixed_nk.cpp)
-  #   list(APPEND PROFILER_SOURCES profile_grouped_gemm_two_stage.cpp)
-  #   list(APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp)
-  #   list(APPEND PROFILER_SOURCES profile_grouped_gemm_tile_loop.cpp)
-  #   list(APPEND PROFILER_SOURCES profile_grouped_gemm_multiply_tile_loop.cpp)
-  # endif()
-  # list(APPEND PROFILER_SOURCES profile_gemm_multiply_add.cpp)
-  # if(SUPPORTED_GPU_TARGETS MATCHES "gfx94")
-    # list(APPEND PROFILER_SOURCES profile_gemm_multiply_multiply.cpp)
+  if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
+    list(APPEND PROFILER_SOURCES profile_contraction_bilinear.cpp)
+    list(APPEND PROFILER_SOURCES profile_contraction_scale.cpp)
+  endif()
+  if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+    list(APPEND PROFILER_SOURCES profile_gemm_reduce.cpp)
+    list(APPEND PROFILER_SOURCES profile_batched_gemm_gemm.cpp)
+    list(APPEND PROFILER_SOURCES profile_batched_gemm_add_relu_gemm_add.cpp)
+    list(APPEND PROFILER_SOURCES profile_gemm_add.cpp)
+    list(APPEND PROFILER_SOURCES profile_gemm_add_add_fastgelu.cpp)
+    list(APPEND PROFILER_SOURCES profile_gemm_add_fastgelu.cpp)
+    list(APPEND PROFILER_SOURCES profile_grouped_gemm.cpp)
+    list(APPEND PROFILER_SOURCES profile_gemm_streamk.cpp)
+    list(APPEND PROFILER_SOURCES profile_gemm_fastgelu.cpp)
+    list(APPEND PROFILER_SOURCES profile_gemm_add_relu.cpp)
+    list(APPEND PROFILER_SOURCES profile_gemm_add_silu.cpp)
+    list(APPEND PROFILER_SOURCES profile_gemm_add_relu_add_layernorm.cpp)
+    list(APPEND PROFILER_SOURCES profile_grouped_gemm_fixed_nk.cpp)
+    list(APPEND PROFILER_SOURCES profile_grouped_gemm_fastgelu.cpp)
+    list(APPEND PROFILER_SOURCES profile_grouped_gemm_tile_loop.cpp)
+    list(APPEND PROFILER_SOURCES profile_grouped_gemm_multiply_tile_loop.cpp)
+  endif()
+  list(APPEND PROFILER_SOURCES profile_gemm_multiply_add.cpp)
+  if(SUPPORTED_GPU_TARGETS MATCHES "gfx94")
+    list(APPEND PROFILER_SOURCES profile_gemm_multiply_multiply.cpp)
     list(APPEND PROFILER_SOURCES profile_gemm_multiply_multiply_weight_preshuffle.cpp)
-  #   list(APPEND PROFILER_SOURCES profile_gemm_ab_scale.cpp)
-  # endif()
-  # list(APPEND PROFILER_SOURCES profile_batched_gemm.cpp)
-  # list(APPEND PROFILER_SOURCES profile_batched_gemm_reduce.cpp)
-  # list(APPEND PROFILER_SOURCES profile_gemm_add_multiply.cpp)
-  # list(APPEND PROFILER_SOURCES profile_gemm_bias_add_reduce.cpp)
-  # list(APPEND PROFILER_SOURCES profile_gemm_splitk.cpp)
-  # list(APPEND PROFILER_SOURCES profile_gemm_universal.cpp)
-  # list(APPEND PROFILER_SOURCES profile_gemm_universal_reduce.cpp)
-  # list(APPEND PROFILER_SOURCES profile_gemm_universal_streamk.cpp)
-  # list(APPEND PROFILER_SOURCES profile_conv_fwd_bias_relu.cpp)
-  # list(APPEND PROFILER_SOURCES profile_conv_fwd_bias_relu_add.cpp)
-  # list(APPEND PROFILER_SOURCES profile_conv_bwd_data.cpp)
-  # list(APPEND PROFILER_SOURCES profile_conv_fwd.cpp)
-  # list(APPEND PROFILER_SOURCES profile_grouped_conv_fwd_outelementop.cpp)
+    list(APPEND PROFILER_SOURCES profile_gemm_ab_scale.cpp)
+  endif()
+  list(APPEND PROFILER_SOURCES profile_batched_gemm.cpp)
+  list(APPEND PROFILER_SOURCES profile_batched_gemm_reduce.cpp)
+  list(APPEND PROFILER_SOURCES profile_gemm_add_multiply.cpp)
+  list(APPEND PROFILER_SOURCES profile_gemm_bias_add_reduce.cpp)
+  list(APPEND PROFILER_SOURCES profile_gemm_splitk.cpp)
+  list(APPEND PROFILER_SOURCES profile_gemm_universal.cpp)
+  list(APPEND PROFILER_SOURCES profile_gemm_b_scale.cpp)
+  list(APPEND PROFILER_SOURCES profile_batched_gemm_b_scale.cpp)
+  list(APPEND PROFILER_SOURCES profile_gemm_universal_batched.cpp)
+  list(APPEND PROFILER_SOURCES profile_gemm_universal_reduce.cpp)
+  list(APPEND PROFILER_SOURCES profile_gemm_universal_streamk.cpp)
+  list(APPEND PROFILER_SOURCES profile_conv_fwd_bias_relu.cpp)
+  list(APPEND PROFILER_SOURCES profile_conv_fwd_bias_relu_add.cpp)
+  list(APPEND PROFILER_SOURCES profile_conv_bwd_data.cpp)
+  list(APPEND PROFILER_SOURCES profile_conv_fwd.cpp)
+  list(APPEND PROFILER_SOURCES profile_grouped_conv_fwd_outelementop.cpp)
 
 endif()
 
-# if(SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12" OR SUPPORTED_GPU_TARGETS MATCHES "gfx9")
-#   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-#     list(APPEND PROFILER_SOURCES profile_gemm_bilinear.cpp)
-#   endif()
-#   list(APPEND PROFILER_SOURCES profile_grouped_conv_fwd.cpp)
-#   list(APPEND PROFILER_SOURCES profile_grouped_conv_bwd_data.cpp)
-#   list(APPEND PROFILER_SOURCES profile_grouped_conv_bwd_weight.cpp)
-# endif()
+if(SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12" OR SUPPORTED_GPU_TARGETS MATCHES "gfx9")
+  if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+    list(APPEND PROFILER_SOURCES profile_gemm_bilinear.cpp)
+  endif()
+  list(APPEND PROFILER_SOURCES profile_grouped_conv_fwd.cpp)
+  list(APPEND PROFILER_SOURCES profile_grouped_conv_bwd_data.cpp)
+  list(APPEND PROFILER_SOURCES profile_grouped_conv_bwd_weight.cpp)
+endif()
 
-# if(DL_KERNELS)
-#   list(APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp)
-#   list(APPEND PROFILER_SOURCES profile_grouped_conv_bwd_weight.cpp)
-# endif()
+if(DL_KERNELS)
+  list(APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp)
+  list(APPEND PROFILER_SOURCES profile_grouped_conv_bwd_weight.cpp)
+endif()
 
 set(PROFILER_EXECUTABLE ckProfiler)
 
@@ -95,88 +97,90 @@ if(NOT WIN32 AND ${hip_VERSION_FLAT} GREATER 600241132)
 endif()
 
 target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE utility getopt::getopt)
-# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_instance)
-# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_fwd_instance)
-# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_data_instance)
-# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_gamma_beta_instance)
-# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance)
-# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance)
-# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance)
-# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool2d_fwd_instance)
-# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool3d_fwd_instance)
-# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_avg_pool2d_bwd_instance)
-# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_avg_pool3d_bwd_instance)
-# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_max_pool_bwd_instance)
-# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance)
-# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_instance)
-# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_transpose_instance)
-# target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_permute_scale_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_fwd_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_data_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_normalization_bwd_gamma_beta_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_softmax_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_reduce_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batchnorm_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool2d_fwd_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_pool3d_fwd_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_avg_pool2d_bwd_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_avg_pool3d_bwd_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_max_pool_bwd_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_transpose_instance)
+target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_permute_scale_instance)
 
 if(SUPPORTED_GPU_TARGETS MATCHES "gfx9")
-  # if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
-  #   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
-  #   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance)
-  # endif()
-  # if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-  #   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_instance)
-  #   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_add_fastgelu_instance)
-  #   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_fastgelu_instance)
-  #   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_gemm_instance)
-  #   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_add_relu_gemm_add_instance)
-  #   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_instance)
-  #   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_streamk_instance)
-  #   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_fastgelu_instance)
-  #   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_instance)
-  #   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_silu_instance)
-  #   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_add_layernorm_instance)
-  #   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fixed_nk_instance)
-  #   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgelu_instance)
-  #   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_tile_loop_instance)
-  # endif()
-  # target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance)
-  # target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_reduce_instance)
-  # target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_add_instance)
-  # if(SUPPORTED_GPU_TARGETS MATCHES "gfx94")
-    # target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_multiply_instance)
+  if(DTYPES MATCHES "fp32" OR DTYPES MATCHES "fp64" OR NOT DEFINED DTYPES)
+    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_bilinear_instance)
+    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_contraction_scale_instance)
+  endif()
+  if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_instance)
+    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_add_fastgelu_instance)
+    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_fastgelu_instance)
+    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_gemm_instance)
+    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_add_relu_gemm_add_instance)
+    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_instance)
+    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_streamk_instance)
+    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_fastgelu_instance)
+    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_instance)
+    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_silu_instance)
+    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_relu_add_layernorm_instance)
+    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fixed_nk_instance)
+    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_fastgelu_instance)
+    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_gemm_tile_loop_instance)
+  endif()
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_reduce_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_add_instance)
+  if(SUPPORTED_GPU_TARGETS MATCHES "gfx94")
+    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_multiply_instance)
     target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_multiply_multiply_weight_preshuffle_instance)
-  #   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_ab_scale_instance)
-  # endif()
-  # target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance)
-  # target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_instance)
-  # target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_reduce_instance)
-  # target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_streamk_instance)
-  # target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_multiply_instance)
-  # target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_reduce_instance)
-  # target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bias_add_reduce_instance)
-  # target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_instance)
-  # target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_instance)
-  # target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_add_instance)
-  # target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_fwd_instance)
-  # target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv1d_bwd_data_instance)
-  # target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv3d_bwd_data_instance)
-  # target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_bwd_data_instance)
-  # target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_bwd_weight_instance)
-  # target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_weight_instance)
-  # target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_convscale_instance)
-  # target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_convinvscale_instance)
+    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_ab_scale_instance)
+  endif()
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_splitk_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_b_scale_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_b_scale_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_batched_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_reduce_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_universal_streamk_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_add_multiply_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_reduce_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bias_add_reduce_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_fwd_bias_relu_add_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_fwd_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv1d_bwd_data_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv3d_bwd_data_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_conv2d_bwd_data_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_bwd_weight_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_weight_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_convscale_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_convinvscale_instance)
 endif()
 
-# if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12")
-#   if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
-#     target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bilinear_instance)
-#   endif()
-#   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_instance)
-#   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance)
-#   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance)
-#   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_fwd_instance)
-#   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance)
-# endif()
-
-# if(DL_KERNELS)
-#   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance)
-#   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_bwd_weight_instance)
-#   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_weight_instance)
-#   target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance)
-# endif()
+if(SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR SUPPORTED_GPU_TARGETS MATCHES "gfx11" OR SUPPORTED_GPU_TARGETS MATCHES "gfx12")
+  if(DTYPES MATCHES "fp16" OR NOT DEFINED DTYPES)
+    target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_gemm_bilinear_instance)
+  endif()
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_fwd_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_fwd_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance)
+endif()
 
+if(DL_KERNELS)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv1d_bwd_weight_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_weight_instance)
+  target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_weight_instance)
+endif()
 rocm_install(TARGETS ${PROFILER_EXECUTABLE} COMPONENT profiler)
diff --git a/profiler/src/profile_gemm_multiply_multiply.cpp b/profiler/src/profile_gemm_multiply_multiply.cpp
index 9de67b10fd..ad2bb77544 100644
--- a/profiler/src/profile_gemm_multiply_multiply.cpp
+++ b/profiler/src/profile_gemm_multiply_multiply.cpp
@@ -27,8 +27,9 @@ enum struct GemmDataType
     F16_F8_F16,     // 5
     F16_F16_F16_F8, // 6
     F8_F8_BF16,     // 7
-    INT8_INT8_F16,  // 8
-    F8_F8_F16,      // 9
+    INT8_INT8_BF16, // 8
+    INT8_INT8_F16,  // 9
+    F8_F8_F16,      // 10
 };
 
 #define OP_NAME "gemm_multiply_multiply"
@@ -41,7 +42,7 @@ int profile_gemm_multiply_multiply(int argc, char* argv[])
         printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
         printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: "
                "f16->f8; 7: f8->bf16, "
-               "comp f8; 8: int8->f16; 9: f8->f16, comp f8;)\n");
+               "comp f8; 8: int8->bf16; 9: f8->f16, comp f8;)\n");
         printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
         printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
         printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
@@ -172,6 +173,11 @@ int profile_gemm_multiply_multiply(int argc, char* argv[])
         return profile(
             F8{}, F8{}, F8{}, F32{}, F32{}, F32{}, F16{}, Row{}, Col{}, Row{}, Col{}, Row{});
     }
+    else if(data_type == GemmDataType::INT8_INT8_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(
+            I8{}, I8{}, I8{}, I32{}, F32{}, F32{}, BF16{}, Row{}, Col{}, Row{}, Col{}, Row{});
+    }
     else if(data_type == GemmDataType::INT8_INT8_F16 && layout == GemmMatrixLayout::MK_NK_MN)
     {
         return profile(
diff --git a/profiler/src/profile_gemm_universal.cpp b/profiler/src/profile_gemm_universal.cpp
index 990cbd292e..a22d983da5 100644
--- a/profiler/src/profile_gemm_universal.cpp
+++ b/profiler/src/profile_gemm_universal.cpp
@@ -1,10 +1,10 @@
 // SPDX-License-Identifier: MIT
 // Copyright (c) 2023-2024, Advanced Micro Devices, Inc. All rights reserved.
 
+#include <cstdlib>
+#include <initializer_list>
 #include <iostream>
 #include <numeric>
-#include <initializer_list>
-#include <cstdlib>
 
 #include "profiler/profile_gemm_universal_impl.hpp"
 #include "profiler_operation_registry.hpp"
@@ -27,6 +27,8 @@ enum struct GemmDataType
     F16_F8_F16,     // 5
     F16_F16_F16_F8, // 6
     F8_F8_BF16,     // 7
+    F16_I4_F16,     // 8
+    BF16_I4_BF16,   // 9
 };
 
 #define OP_NAME "gemm_universal"
@@ -39,7 +41,7 @@ int profile_gemm_universal(int argc, char* argv[])
         printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
         printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: "
                "f16->f8; 7: f8->bf16, "
-               "comp f8)\n");
+               "comp f8; 8: f16@i4; 9: bf16@i4\n");
         printf("arg3: matrix layout (0: A[m, k] * B[k, n] = C[m, n];\n");
         printf("                     1: A[m, k] * B[n, k] = C[m, n];\n");
         printf("                     2: A[k, m] * B[k, n] = C[m, n];\n");
@@ -103,6 +105,7 @@ int profile_gemm_universal(int argc, char* argv[])
     using BF16 = ck::bhalf_t;
 #if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
     using F8 = ck::f8_t;
+    using I4 = ck::pk_i4_t;
 #endif
 
     using Row = ck::tensor_layout::gemm::RowMajor;
@@ -207,6 +210,14 @@ int profile_gemm_universal(int argc, char* argv[])
     {
         return profile(F8{}, F8{}, F8{}, F32{}, BF16{}, Row{}, Col{}, Row{});
     }
+    else if(data_type == GemmDataType::F16_I4_F16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(F16{}, I4{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
+    }
+    else if(data_type == GemmDataType::BF16_I4_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(BF16{}, I4{}, BF16{}, F32{}, BF16{}, Row{}, Col{}, Row{});
+    }
 #endif
     else
     {
diff --git a/profiler/src/profile_gemm_universal_streamk.cpp b/profiler/src/profile_gemm_universal_streamk.cpp
old mode 100755
new mode 100644
index 85f6c25770..b0f66a0c73
--- a/profiler/src/profile_gemm_universal_streamk.cpp
+++ b/profiler/src/profile_gemm_universal_streamk.cpp
@@ -56,6 +56,26 @@ int profile_gemm_universal_streamk(int argc, char* argv[])
         exit(1);
     }
 
+    int M;
+    int N;
+    int StrideA;
+    int StrideB;
+    // Analyze the unsupported matrix shapes, switch the M and N number
+    if(std::stoi(argv[9]) % 8 != 0 && std::stoi(argv[8]) % 8 == 0)
+    {
+        M       = std::stoi(argv[9]);
+        StrideA = std::stoi(argv[12]);
+        N       = std::stoi(argv[8]);
+        StrideB = std::stoi(argv[11]);
+    }
+    else
+    {
+        M       = std::stoi(argv[8]);
+        StrideA = std::stoi(argv[11]);
+        N       = std::stoi(argv[9]);
+        StrideB = std::stoi(argv[12]);
+    }
+
     const auto data_type       = static_cast<GemmDataType>(std::stoi(argv[2]));
     const auto layout          = static_cast<GemmMatrixLayout>(std::stoi(argv[3]));
     const bool do_verification = std::stoi(argv[4]);
@@ -63,12 +83,8 @@ int profile_gemm_universal_streamk(int argc, char* argv[])
     const bool do_log          = std::stoi(argv[6]);
     const bool time_kernel     = std::stoi(argv[7]);
 
-    const int M = std::stoi(argv[8]);
-    const int N = std::stoi(argv[9]);
     const int K = std::stoi(argv[10]);
 
-    const int StrideA     = std::stoi(argv[11]);
-    const int StrideB     = std::stoi(argv[12]);
     const int StrideC     = std::stoi(argv[13]);
     const int Streamk_sel = std::stoi(argv[14]);
     const int Grid_size   = std::stoi(argv[15]);
@@ -83,8 +99,9 @@ int profile_gemm_universal_streamk(int argc, char* argv[])
         rotating = std::stoull(argv[18]) * 1024 * 1024;
     }
 
-    using F32 = float;
-    using F16 = ck::half_t;
+    using F32  = float;
+    using F16  = ck::half_t;
+    using BF16 = ck::bhalf_t;
 
 #if defined(CK_USE_FP8_ON_UNSUPPORTED_ARCH) || defined(CK_USE_GFX94)
     using F8 = ck::f8_t;
@@ -165,6 +182,22 @@ int profile_gemm_universal_streamk(int argc, char* argv[])
         return profile(F8{}, F16{}, F32{}, F16{}, Row{}, Col{}, Row{});
     }
 #endif
+    else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        return profile(BF16{}, BF16{}, F32{}, BF16{}, Row{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        return profile(BF16{}, BF16{}, F32{}, BF16{}, Row{}, Col{}, Row{});
+    }
+    else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::KM_KN_MN)
+    {
+        return profile(BF16{}, BF16{}, F32{}, BF16{}, Col{}, Row{}, Row{});
+    }
+    else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::KM_NK_MN)
+    {
+        return profile(BF16{}, BF16{}, F32{}, BF16{}, Col{}, Col{}, Row{});
+    }
     else
     {
         std::cout << "this data_type & layout is not implemented" << std::endl;
diff --git a/profiler/src/profile_grouped_gemm_fixed_nk.cpp b/profiler/src/profile_grouped_gemm_fixed_nk.cpp
index e33d798504..827866ce69 100644
--- a/profiler/src/profile_grouped_gemm_fixed_nk.cpp
+++ b/profiler/src/profile_grouped_gemm_fixed_nk.cpp
@@ -17,11 +17,11 @@ enum struct GemmMatrixLayout
 
 enum struct GemmDataType
 {
-    BF16_I8_BF16, // 0
-    F16_F16_F16,  // 1
-    F16_F8_F16,   // 2
-    F16_I8_F16,   // 3
-
+    BF16_I8_BF16,  // 0
+    F16_F16_F16,   // 1
+    F16_F8_F16,    // 2
+    F16_I8_F16,    // 3
+    BF16_BF16_BF16 // 4
 };
 
 #define OP_NAME "grouped_gemm_fixed_nk"
@@ -39,7 +39,6 @@ std::vector<int> argToIntArray(char* input)
     {
         out.push_back(std::stoi(item));
     }
-
     return out;
 }
 
@@ -83,14 +82,6 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[])
     const auto StrideCs = argToIntArray(argv[13]);
     const int kbatch    = argc >= 15 ? std::stoi(argv[14]) : 1;
 
-    using F32 = float;
-    using F16 = ck::half_t;
-#if defined(CK_ENABLE_FP8)
-    using F8 = ck::f8_t;
-#endif
-    using BF16 = ck::bhalf_t;
-    using I8   = int8_t;
-
     int n_warmup = 1;
     int n_iter   = 10;
     if(argc == 17)
@@ -99,61 +90,12 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[])
         n_iter   = std::stoi(argv[16]);
     }
 
-#if defined(CK_ENABLE_BF16) && defined(CK_ENABLE_INT8)
-    if(data_type == GemmDataType::BF16_I8_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
+    if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
     {
-        ck::profiler::profile_grouped_gemm_fixed_nk_impl<BF16,
-                                                         I8,
-                                                         BF16,
-                                                         F32,
-                                                         ck::tensor_layout::gemm::RowMajor,
-                                                         ck::tensor_layout::gemm::RowMajor,
-                                                         ck::tensor_layout::gemm::RowMajor>(
-            do_verification,
-            init_method,
-            do_log,
-            time_kernel,
-            Ms,
-            Ns,
-            Ks,
-            StrideAs,
-            StrideBs,
-            StrideCs,
-            kbatch,
-            n_warmup,
-            n_iter);
-    }
-    else if(data_type == GemmDataType::BF16_I8_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
-    {
-        ck::profiler::profile_grouped_gemm_fixed_nk_impl<BF16,
-                                                         I8,
-                                                         BF16,
-                                                         F32,
-                                                         ck::tensor_layout::gemm::RowMajor,
-                                                         ck::tensor_layout::gemm::ColumnMajor,
-                                                         ck::tensor_layout::gemm::RowMajor>(
-            do_verification,
-            init_method,
-            do_log,
-            time_kernel,
-            Ms,
-            Ns,
-            Ks,
-            StrideAs,
-            StrideBs,
-            StrideCs,
-            kbatch,
-            n_warmup,
-            n_iter);
-    }
-#endif
-#if defined(CK_ENABLE_FP16)
-    else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_KN_MN)
-    {
-        ck::profiler::profile_grouped_gemm_fixed_nk_impl<F16,
-                                                         F16,
-                                                         F16,
-                                                         F32,
+        ck::profiler::profile_grouped_gemm_fixed_nk_impl<ck::half_t,
+                                                         ck::half_t,
+                                                         ck::half_t,
+                                                         float,
                                                          ck::tensor_layout::gemm::RowMajor,
                                                          ck::tensor_layout::gemm::RowMajor,
                                                          ck::tensor_layout::gemm::RowMajor>(
@@ -173,10 +115,10 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[])
     }
     else if(data_type == GemmDataType::F16_F16_F16 && layout == GemmMatrixLayout::MK_NK_MN)
     {
-        ck::profiler::profile_grouped_gemm_fixed_nk_impl<F16,
-                                                         F16,
-                                                         F16,
-                                                         F32,
+        ck::profiler::profile_grouped_gemm_fixed_nk_impl<ck::half_t,
+                                                         ck::half_t,
+                                                         ck::half_t,
+                                                         float,
                                                          ck::tensor_layout::gemm::RowMajor,
                                                          ck::tensor_layout::gemm::ColumnMajor,
                                                          ck::tensor_layout::gemm::RowMajor>(
@@ -194,14 +136,13 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[])
             n_warmup,
             n_iter);
     }
-#endif
-#if defined(CK_ENABLE_FP16) && defined(CK_ENABLE_FP8)
+#if defined(CK_ENABLE_FP8)
     else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_KN_MN)
     {
-        ck::profiler::profile_grouped_gemm_fixed_nk_impl<F16,
-                                                         F8,
-                                                         F16,
-                                                         F32,
+        ck::profiler::profile_grouped_gemm_fixed_nk_impl<ck::half_t,
+                                                         ck::f8_t,
+                                                         ck::half_t,
+                                                         float,
                                                          ck::tensor_layout::gemm::RowMajor,
                                                          ck::tensor_layout::gemm::RowMajor,
                                                          ck::tensor_layout::gemm::RowMajor>(
@@ -221,10 +162,10 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[])
     }
     else if(data_type == GemmDataType::F16_F8_F16 && layout == GemmMatrixLayout::MK_NK_MN)
     {
-        ck::profiler::profile_grouped_gemm_fixed_nk_impl<F16,
-                                                         F8,
-                                                         F16,
-                                                         F32,
+        ck::profiler::profile_grouped_gemm_fixed_nk_impl<ck::half_t,
+                                                         ck::f8_t,
+                                                         ck::half_t,
+                                                         float,
                                                          ck::tensor_layout::gemm::RowMajor,
                                                          ck::tensor_layout::gemm::ColumnMajor,
                                                          ck::tensor_layout::gemm::RowMajor>(
@@ -242,14 +183,14 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[])
             n_warmup,
             n_iter);
     }
-#endif
-#if defined(CK_ENABLE_FP16) && defined(CK_ENABLE_INT8)
+#endif // CK_ENABLE_FP8
+#if defined(CK_ENABLE_INT8)
     else if(data_type == GemmDataType::F16_I8_F16 && layout == GemmMatrixLayout::MK_KN_MN)
     {
-        ck::profiler::profile_grouped_gemm_fixed_nk_impl<F16,
-                                                         I8,
-                                                         F16,
-                                                         F32,
+        ck::profiler::profile_grouped_gemm_fixed_nk_impl<ck::half_t,
+                                                         int8_t,
+                                                         ck::half_t,
+                                                         float,
                                                          ck::tensor_layout::gemm::RowMajor,
                                                          ck::tensor_layout::gemm::RowMajor,
                                                          ck::tensor_layout::gemm::RowMajor>(
@@ -269,10 +210,10 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[])
     }
     else if(data_type == GemmDataType::F16_I8_F16 && layout == GemmMatrixLayout::MK_NK_MN)
     {
-        ck::profiler::profile_grouped_gemm_fixed_nk_impl<F16,
-                                                         I8,
-                                                         F16,
-                                                         F32,
+        ck::profiler::profile_grouped_gemm_fixed_nk_impl<ck::half_t,
+                                                         int8_t,
+                                                         ck::half_t,
+                                                         float,
                                                          ck::tensor_layout::gemm::RowMajor,
                                                          ck::tensor_layout::gemm::ColumnMajor,
                                                          ck::tensor_layout::gemm::RowMajor>(
@@ -286,11 +227,107 @@ int profile_grouped_gemm_fixed_nk(int argc, char* argv[])
             StrideAs,
             StrideBs,
             StrideCs,
-            1,
+            kbatch,
             n_warmup,
             n_iter);
     }
-#endif
+#endif // CK_ENABLE_INT8
+#if defined(CK_ENABLE_BF16)
+    else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        ck::profiler::profile_grouped_gemm_fixed_nk_impl<ck::bhalf_t,
+                                                         ck::bhalf_t,
+                                                         ck::bhalf_t,
+                                                         float,
+                                                         ck::tensor_layout::gemm::RowMajor,
+                                                         ck::tensor_layout::gemm::RowMajor,
+                                                         ck::tensor_layout::gemm::RowMajor>(
+            do_verification,
+            init_method,
+            do_log,
+            time_kernel,
+            Ms,
+            Ns,
+            Ks,
+            StrideAs,
+            StrideBs,
+            StrideCs,
+            kbatch,
+            n_warmup,
+            n_iter);
+    }
+    else if(data_type == GemmDataType::BF16_BF16_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        ck::profiler::profile_grouped_gemm_fixed_nk_impl<ck::bhalf_t,
+                                                         ck::bhalf_t,
+                                                         ck::bhalf_t,
+                                                         float,
+                                                         ck::tensor_layout::gemm::RowMajor,
+                                                         ck::tensor_layout::gemm::ColumnMajor,
+                                                         ck::tensor_layout::gemm::RowMajor>(
+            do_verification,
+            init_method,
+            do_log,
+            time_kernel,
+            Ms,
+            Ns,
+            Ks,
+            StrideAs,
+            StrideBs,
+            StrideCs,
+            kbatch,
+            n_warmup,
+            n_iter);
+    }
+#if defined(CK_ENABLE_INT8)
+    else if(data_type == GemmDataType::BF16_I8_BF16 && layout == GemmMatrixLayout::MK_KN_MN)
+    {
+        ck::profiler::profile_grouped_gemm_fixed_nk_impl<ck::bhalf_t,
+                                                         int8_t,
+                                                         ck::bhalf_t,
+                                                         float,
+                                                         ck::tensor_layout::gemm::RowMajor,
+                                                         ck::tensor_layout::gemm::RowMajor,
+                                                         ck::tensor_layout::gemm::RowMajor>(
+            do_verification,
+            init_method,
+            do_log,
+            time_kernel,
+            Ms,
+            Ns,
+            Ks,
+            StrideAs,
+            StrideBs,
+            StrideCs,
+            kbatch,
+            n_warmup,
+            n_iter);
+    }
+    else if(data_type == GemmDataType::BF16_I8_BF16 && layout == GemmMatrixLayout::MK_NK_MN)
+    {
+        ck::profiler::profile_grouped_gemm_fixed_nk_impl<ck::bhalf_t,
+                                                         int8_t,
+                                                         ck::bhalf_t,
+                                                         float,
+                                                         ck::tensor_layout::gemm::RowMajor,
+                                                         ck::tensor_layout::gemm::ColumnMajor,
+                                                         ck::tensor_layout::gemm::RowMajor>(
+            do_verification,
+            init_method,
+            do_log,
+            time_kernel,
+            Ms,
+            Ns,
+            Ks,
+            StrideAs,
+            StrideBs,
+            StrideCs,
+            kbatch,
+            n_warmup,
+            n_iter);
+    }
+#endif // CK_ENABLE_INT8
+#endif // CK_ENABLE_BF16
     else
     {
         throw std::runtime_error("wrong! this GEMM data_type & layout is not implemented");
diff --git a/test/grouped_convnd_bwd_weight/CMakeLists.txt b/test/grouped_convnd_bwd_weight/CMakeLists.txt
index 313b5ba4ca..063e0248e7 100644
--- a/test/grouped_convnd_bwd_weight/CMakeLists.txt
+++ b/test/grouped_convnd_bwd_weight/CMakeLists.txt
@@ -9,6 +9,10 @@ add_gtest_executable(test_grouped_convnd_bwd_weight_interface_xdl test_grouped_c
 if(result EQUAL 0)
    target_link_libraries(test_grouped_convnd_bwd_weight_interface_xdl PRIVATE utility)
 endif()
+add_gtest_executable(test_grouped_convnd_bwd_weight_v3_interface_xdl test_grouped_convnd_bwd_weight_v3_interface_xdl.cpp)
+if(result EQUAL 0)
+   target_link_libraries(test_grouped_convnd_bwd_weight_v3_interface_xdl PRIVATE utility)
+endif()
 add_gtest_executable(test_grouped_convnd_bwd_weight_interface_wmma test_grouped_convnd_bwd_weight_interface_wmma.cpp)
 if(result EQUAL 0)
    target_link_libraries(test_grouped_convnd_bwd_weight_interface_wmma PRIVATE utility)
diff --git a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_v3_interface_xdl.cpp b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_v3_interface_xdl.cpp
new file mode 100644
index 0000000000..1556f15898
--- /dev/null
+++ b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_v3_interface_xdl.cpp
@@ -0,0 +1,179 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <cstdlib>
+#include <iostream>
+#include <initializer_list>
+#include <tuple>
+#include <vector>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/convolution_backward_weight_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp"
+
+#include "ck/library/utility/convolution_parameter.hpp"
+#include "ck/library/utility/algorithm.hpp"
+#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
+
+#include <gtest/gtest.h>
+
+using F16         = ck::half_t;
+using F32         = float;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using ConvolutionBackwardWeightSpecialization =
+    ck::tensor_operation::device::ConvolutionBackwardWeightSpecialization;
+
+static constexpr auto ConvBwdWeightDefault = ConvolutionBackwardWeightSpecialization::Default;
+static constexpr auto Filter1x1Stride1Pad0 =
+    ConvolutionBackwardWeightSpecialization::Filter1x1Stride1Pad0;
+
+template <typename Tuple, ConvolutionBackwardWeightSpecialization ConvSpec>
+class TestGroupedConvndBwdWeight : public ::testing::Test
+{
+    protected:
+    static constexpr ck::index_t NDimSpatial = 2;
+
+    using InLayout  = std::tuple_element_t<2, Tuple>;
+    using WeiLayout = std::tuple_element_t<1, Tuple>;
+    using OutLayout = std::tuple_element_t<0, Tuple>;
+
+    // clang-format off
+    using GroupedConvBwdWeightDeviceInstance = ck::tensor_operation::device::DeviceGroupedConvBwdWeight_Xdl_CShuffleV3
+        //##########|     Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| AccData|          In|         Wei|         Out|                      ConvBackward| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|   ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|   CBlockTransfer|  CBlockTransfer|
+        //##########|     Dim|         |          |          |   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|                            Weight|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|    ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave|   ClusterLengths| ScalarPerVector|
+        //##########| Spatial|         |          |          |       |        |        |        |   Operation|   Operation|   Operation|                    Specialization|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|     ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| MBlock_MPerBlock|    NWaveNPerXdl|
+        //##########|        |         |          |          |       |        |        |        |            |            |            |                                  |      |      |      |      |   |     |     |     |     |                |                 |               |               |               |               |          |                |               |               |              |               |               |          |            |            | NBlock_NPerBlock|                |
+        <         NDimSpatial,  InLayout, WeiLayout,OutLayout,    F16,     F16,     F16,     F32, PassThrough, PassThrough, PassThrough,                  ConvSpec,   128,    32,   128,     32,  8,   32,   32,    1,    2,  S<4, 4,  1>,  S<2, 0, 1>,  S<1, 0, 2>,              2,              8,              1,      true,  S<4, 4,  1>,  S<2, 0, 1>,  S<1, 0, 2>,             2,              8,              4,      true,           1,           1,   S<1, 32, 1, 4>,               8>;
+    // clang-format on
+
+    ck::utils::conv::ConvParam conv_param;
+    ck::index_t split_k{2};
+
+    template <ck::index_t NDimSpatial>
+    bool Run()
+    {
+
+        const auto in_g_n_c_wis_desc =
+            ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(
+                conv_param);
+
+        const auto wei_g_k_c_xs_desc =
+            ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(
+                conv_param);
+
+        const auto out_g_n_k_wos_desc =
+            ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(
+                conv_param);
+
+        std::array<ck::index_t, NDimSpatial + 3> input_lengths{};
+        std::array<ck::index_t, NDimSpatial + 3> filter_lengths{};
+        std::array<ck::index_t, NDimSpatial + 3> output_lengths{};
+        std::array<ck::index_t, NDimSpatial + 3> input_strides{};
+        std::array<ck::index_t, NDimSpatial + 3> weights_strides{};
+        std::array<ck::index_t, NDimSpatial + 3> output_strides{};
+        std::array<ck::index_t, NDimSpatial> conv_filter_strides{};
+        std::array<ck::index_t, NDimSpatial> conv_filter_dilations{};
+        std::array<ck::index_t, NDimSpatial> input_left_pads{};
+        std::array<ck::index_t, NDimSpatial> input_right_pads{};
+
+        auto range_copy = [](const auto& from, auto to) { std::copy(begin(from), end(from), to); };
+
+        range_copy(in_g_n_c_wis_desc.GetLengths(), begin(input_lengths));
+        range_copy(in_g_n_c_wis_desc.GetStrides(), begin(input_strides));
+        range_copy(wei_g_k_c_xs_desc.GetLengths(), begin(filter_lengths));
+        range_copy(wei_g_k_c_xs_desc.GetStrides(), begin(weights_strides));
+        range_copy(out_g_n_k_wos_desc.GetLengths(), begin(output_lengths));
+        range_copy(out_g_n_k_wos_desc.GetStrides(), begin(output_strides));
+        range_copy(conv_param.conv_filter_strides_, begin(conv_filter_strides));
+        range_copy(conv_param.conv_filter_dilations_, begin(conv_filter_dilations));
+        range_copy(conv_param.input_left_pads_, begin(input_left_pads));
+        range_copy(conv_param.input_right_pads_, begin(input_right_pads));
+
+        auto conv = GroupedConvBwdWeightDeviceInstance{};
+
+        auto argument = conv.MakeArgument(nullptr,
+                                          nullptr,
+                                          nullptr,
+                                          input_lengths,
+                                          input_strides,
+                                          filter_lengths,
+                                          weights_strides,
+                                          output_lengths,
+                                          output_strides,
+                                          conv_filter_strides,
+                                          conv_filter_dilations,
+                                          input_left_pads,
+                                          input_right_pads,
+                                          PassThrough{},
+                                          PassThrough{},
+                                          PassThrough{},
+                                          split_k);
+        return conv.IsSupportedArgument(argument);
+    }
+};
+
+using GNHWC = ck::tensor_layout::convolution::GNHWC;
+using NHWGC = ck::tensor_layout::convolution::NHWGC;
+
+using GKYXC = ck::tensor_layout::convolution::GKYXC;
+
+using GNHWK = ck::tensor_layout::convolution::GNHWK;
+using NHWGK = ck::tensor_layout::convolution::NHWGK;
+
+using KernelTypes =
+    ::testing::Types<std::tuple<GNHWK, GKYXC, GNHWC>, std::tuple<NHWGK, GKYXC, NHWGC>>;
+
+template <typename Tuple>
+class TestGroupedConvndBwdWeightV3Default
+    : public TestGroupedConvndBwdWeight<Tuple, ConvBwdWeightDefault>
+{
+};
+
+template <typename Tuple>
+class TestGroupedConvndBwdWeightV3Filter1x1
+    : public TestGroupedConvndBwdWeight<Tuple, Filter1x1Stride1Pad0>
+{
+};
+
+TYPED_TEST_SUITE(TestGroupedConvndBwdWeightV3Default, KernelTypes);
+TYPED_TEST_SUITE(TestGroupedConvndBwdWeightV3Filter1x1, KernelTypes);
+
+TYPED_TEST(TestGroupedConvndBwdWeightV3Filter1x1, SpecializationCheck)
+{
+    // Check filter 3,3 instead of 1,1
+    this->conv_param  = {2, 2, 4, 192, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {0, 0}, {0, 0}};
+    bool is_supported = this->template Run<2>();
+    EXPECT_FALSE(is_supported);
+
+    // Check strides 2,2 instead of 1,1
+    this->conv_param = {2, 2, 4, 192, 192, {1, 1}, {28, 28}, {2, 2}, {1, 1}, {0, 0}, {0, 0}};
+    is_supported     = this->template Run<2>();
+    EXPECT_FALSE(is_supported);
+
+    // Check with pad
+    this->conv_param = {2, 2, 4, 192, 192, {1, 1}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}};
+    is_supported     = this->template Run<2>();
+    EXPECT_FALSE(is_supported);
+
+    // Not supported version
+    this->conv_param = {2, 2, 128, 128, 256, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}};
+    is_supported     = this->template Run<2>();
+    EXPECT_FALSE(is_supported);
+}
+
+TYPED_TEST(TestGroupedConvndBwdWeightV3Default, VectorLoadCheck)
+{
+    // vector load for A
+    this->conv_param  = {2, 2, 128, 129, 256, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}};
+    bool is_supported = this->template Run<2>();
+    EXPECT_FALSE(is_supported);
+    // vector load for B, E, Ds
+    this->conv_param = {2, 2, 128, 128, 257, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}};
+    is_supported     = this->template Run<2>();
+    EXPECT_FALSE(is_supported);
+}
diff --git a/test/mx_mfma_op/mx_mfma_op.cpp b/test/mx_mfma_op/mx_mfma_op.cpp
index cc612794f4..f65e89bb82 100644
--- a/test/mx_mfma_op/mx_mfma_op.cpp
+++ b/test/mx_mfma_op/mx_mfma_op.cpp
@@ -30,11 +30,11 @@ bool run_mfma_test(ck::index_t init)
     constexpr auto BLOCK_N = mfma_instr.n_per_blk;
     constexpr auto BLOCK_K = mfma_instr.num_input_blks * mfma_instr.k_per_blk;
 
-    const auto mx_mfma_kernel = ck::matmul<AType, BType, CType, AccType, BLOCK_M, BLOCK_N, BLOCK_K>;
+    const auto mfma_kernel = ck::matmul<AType, BType, CType, AccType, BLOCK_M, BLOCK_N, BLOCK_K>;
 
     bool pass = true;
 
-    pass = ck::mfma_test::TestMFMA<decltype(mx_mfma_kernel),
+    pass = ck::mfma_test::TestMFMA<decltype(mfma_kernel),
                                    AType,
                                    BType,
                                    CType,
@@ -45,21 +45,80 @@ bool run_mfma_test(ck::index_t init)
                                    CLayout,
                                    BLOCK_M,
                                    BLOCK_N,
-                                   BLOCK_K>{}(mx_mfma_kernel, init);
+                                   BLOCK_K>{}(mfma_kernel, init);
 
     return pass;
 }
 
 TEST(MFMA, FP8MFMA16x16x128)
 {
-    auto AB_init = 0;
+    auto AB_init = 4;
     auto pass    = run_mfma_test<f8_t, f8_t, half_t, ck::MFMA_F8F6F4::F32_16x16x128>(AB_init);
     EXPECT_TRUE(pass);
 }
 
 TEST(MFMA, FP8MFMA32x32x64)
 {
-    auto AB_init = 0;
+    auto AB_init = 4;
     auto pass    = run_mfma_test<f8_t, f8_t, float, ck::MFMA_F8F6F4::F32_32x32x64>(AB_init);
     EXPECT_TRUE(pass);
 }
+
+/**
+ * @brief Run the test for the given MX MFMA instruction
+ *
+ * @param init - selects initialization algorithm for A and B tensors
+ */
+template <typename AType, typename BType, typename CType, ck::MFMA_F8F6F4 mfma>
+bool run_mxmfma_test(ck::index_t init)
+{
+    static_assert(mfma == ck::MFMA_F8F6F4::SCALE_F32_16x16x128 ||
+                      mfma == ck::MFMA_F8F6F4::SCALE_F32_32x32x64,
+                  "Only SCALE_F32_16x16x128 and SCALE_F32_32x32x64 are supported");
+    using ALayout = ck::tensor_layout::gemm::RowMajor;
+    using BLayout = ck::tensor_layout::gemm::ColumnMajor;
+    using CLayout = ck::tensor_layout::gemm::RowMajor;
+
+    using AccType   = float;           // only MFMA_F32 instructions supported
+    using ScaleType = ck::e8m0_bexp_t; // biased exponent type
+
+    ck::mfma_type<static_cast<ck::MfmaInstr>(mfma)> mfma_instr;
+    constexpr auto BLOCK_M = mfma_instr.m_per_blk;
+    constexpr auto BLOCK_N = mfma_instr.n_per_blk;
+    constexpr auto BLOCK_K = mfma_instr.num_input_blks * mfma_instr.k_per_blk;
+    constexpr auto BLOCK_X = 32; // scaling vector size
+
+    const auto mx_mfma_kernel =
+        ck::matmul<AType, BType, ScaleType, CType, AccType, BLOCK_M, BLOCK_N, BLOCK_K, BLOCK_X>;
+
+    bool pass = true;
+
+    pass = ck::mxmfma_test::TestMXMFMA<decltype(mx_mfma_kernel),
+                                       AType,
+                                       BType,
+                                       ScaleType,
+                                       CType,
+                                       ALayout,
+                                       BLayout,
+                                       CLayout,
+                                       BLOCK_M,
+                                       BLOCK_N,
+                                       BLOCK_K,
+                                       BLOCK_X>{}(mx_mfma_kernel, init);
+
+    return pass;
+}
+
+TEST(MXMFMA, MXFP8MFMA16x16x128)
+{
+    auto AB_init = 7;
+    auto pass = run_mxmfma_test<f8_t, f8_t, float, ck::MFMA_F8F6F4::SCALE_F32_16x16x128>(AB_init);
+    EXPECT_TRUE(pass);
+}
+
+TEST(MXMFMA, MXFP8MFMA32x32x64)
+{
+    auto AB_init = 7;
+    auto pass = run_mxmfma_test<f8_t, f8_t, half_t, ck::MFMA_F8F6F4::SCALE_F32_32x32x64>(AB_init);
+    EXPECT_TRUE(pass);
+}
diff --git a/test/mx_mfma_op/mx_mfma_op.hpp b/test/mx_mfma_op/mx_mfma_op.hpp
index e96e1b0b29..1f9091ebc5 100644
--- a/test/mx_mfma_op/mx_mfma_op.hpp
+++ b/test/mx_mfma_op/mx_mfma_op.hpp
@@ -1,3 +1,6 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
 #pragma once
 
 #include "ck/ck.hpp"
@@ -7,7 +10,7 @@
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/warp/xdlops_gemm.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
-#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_mx_gemm.hpp"
 #include "ck/library/utility/check_err.hpp"
 
 namespace ck {
@@ -18,7 +21,13 @@ enum class MFMA_F8F6F4
     F32_16x16x128 =
         static_cast<int>(MfmaInstr::mfma_f32_16x16x128f8f6f4), // V_MFMA_F32_16X16X128_F8F6F4
     F32_32x32x64 =
-        static_cast<int>(MfmaInstr::mfma_f32_32x32x64f8f6f4) // V_MFMA_F32_32X32X64_F8F6F4
+        static_cast<int>(MfmaInstr::mfma_f32_32x32x64f8f6f4), // V_MFMA_F32_32X32X64_F8F6F4
+
+    SCALE_F32_16x16x128 = static_cast<int>(
+        MfmaInstr::mfma_scale_f32_16x16x128f8f6f4), // V_MFMA_SCALE_F32_16X16X128_F8F6F4
+    SCALE_F32_32x32x64 = static_cast<int>(
+        MfmaInstr::mfma_scale_f32_32x32x64f8f6f4) // V_MFMA_SCALE_F32_32X32X64_F8F6F4
+
 };
 
 template <typename AFragT, typename BFragT, typename AccumFragT, int32_t BLOCK_M, int32_t BLOCK_N>
@@ -32,6 +41,17 @@ struct mfma_type_selector<AFragT, BFragT, AccumFragT, 16, 16>
         auto op = mfma_type<MfmaInstr::mfma_f32_16x16x128f8f6f4>{};
         op.template run<16, 16, AFragT, BFragT, AccumFragT>(fragA, fragB, fragAcc);
     }
+
+    __device__ void operator()(AFragT const& fragA,
+                               const int32_t scale_a,
+                               BFragT const& fragB,
+                               const int32_t scale_b,
+                               AccumFragT& fragAcc)
+    {
+        auto op = mfma_type<MfmaInstr::mfma_scale_f32_16x16x128f8f6f4>{};
+        op.template run<16, 16, AFragT, BFragT, AccumFragT>(
+            fragA, scale_a, fragB, scale_b, fragAcc);
+    }
 };
 
 template <typename AFragT, typename BFragT, typename AccumFragT>
@@ -42,6 +62,17 @@ struct mfma_type_selector<AFragT, BFragT, AccumFragT, 32, 32>
         auto op = mfma_type<MfmaInstr::mfma_f32_32x32x64f8f6f4>{};
         op.template run<32, 32, AFragT, BFragT, AccumFragT>(fragA, fragB, fragAcc);
     }
+
+    __device__ void operator()(AFragT const& fragA,
+                               const int32_t scale_a,
+                               BFragT const& fragB,
+                               const int32_t scale_b,
+                               AccumFragT& fragAcc)
+    {
+        auto op = mfma_type<MfmaInstr::mfma_scale_f32_32x32x64f8f6f4>{};
+        op.template run<32, 32, AFragT, BFragT, AccumFragT>(
+            fragA, scale_a, fragB, scale_b, fragAcc);
+    }
 };
 
 template <typename VecT>
@@ -52,151 +83,428 @@ static constexpr int32_t vectorSize(const VecT&)
 
 // Define a load function for input A blocks:
 // Size: (BLOCK_M x BLOCK_K)
-// ASSUMPTION:
-// - We want contiguous BLOCK_M sized column neighbors in register.
-// - Data is in col_major format
-// This means:
-// - From A we will load K columns of size BLOCK_M to satisfy our input data
+// - Data is in column major format
+// - Rows are loaded in contiguous chunks that map to corresponding microscales
+// - Each row is loaded in chunks of size 16 and each thread loads 32 elements
 template <typename AType, typename AFragT, int32_t BLOCK_M, int32_t BLOCK_K>
 __device__ AFragT load_A_col_major(AType const* input_ptr)
 {
     // clang-format off
     // Register Mapping for 16x128:                                                        ||    Register Mapping for 32x64:
-    // Size              |   BLOCK_M  |   BLOCK_M   |   BLOCK_M  |   BLOCK_M   |           ||    Size              |   BLOCK_M  |   BLOCK_M   |
-    // M                 | 0  ...  15 |  0  ...  15 | 0  ...  15 |  0  ...  15 |           ||    M                 | 0  ...  31 |  0  ...  31 |
-    // Thread Id         | 0  ...  15 | 16  ...  31 | 32  ... 47 | 48  ...  63 | Vector    ||    Thread Id         | 0  ...  31 | 32  ...  63 | Vector
-    // Register Element   ------------ ------------- ------------ -------------  Element   ||    Register Element   ------------ -------------  Element
-    // Reg 0 [0:7]       |     K0     |     K32     |     K64    |     K96     |  v[0]     ||    Reg 0 [0:7]       |     K0     |     K32     |  v[0]
-    // Reg 0 [8:15]      |     K1     |     K33     |     K65    |     K97     |  v[1]     ||    Reg 0 [8:15]      |     K1     |     K33     |  v[1]
-    // Reg 0 [16:23]     |     K2     |     K34     |     K66    |     K98     |  v[2]     ||    Reg 0 [16:23]     |     K2     |     K34     |  v[2]
-    // Reg 0 [24:31]     |     K3     |     K35     |     K67    |     K99     |  v[3]     ||    Reg 0 [24:31]     |     K3     |     K35     |  v[3]
-    // Reg 1 [0:7]       |     K4     |     K36     |     K68    |     K100    |  v[4]     ||    Reg 1 [0:7]       |     K4     |     K36     |  v[4]
-    // Reg 1 [8:15]      |     K5     |     K37     |     K69    |     K101    |  v[5]     ||    Reg 1 [8:15]      |     K5     |     K37     |  v[5]
-    // Reg 1 [16:23]     |     K6     |     K38     |     K70    |     K102    |  v[6]     ||    Reg 1 [16:23]     |     K6     |     K38     |  v[6]
-    // Reg 1 [24:31]     |     K7     |     K39     |     K71    |     K103    |  v[7]     ||    Reg 1 [24:31]     |     K7     |     K39     |  v[7]
-    // Reg 2 [0:7]       |     K8     |     K40     |     K72    |     K104    |  v[8]     ||    Reg 2 [0:7]       |     K8     |     K40     |  v[8]
-    // Reg 2 [8:15]      |     K9     |     K41     |     K73    |     K105    |  v[9]     ||    Reg 2 [8:15]      |     K9     |     K41     |  v[9]
-    // Reg 2 [16:23]     |     K10    |     K42     |     K74    |     K106    |  v[10]    ||    Reg 2 [16:23]     |     K10    |     K42     |  v[10]
-    // Reg 2 [24:31]     |     K11    |     K43     |     K75    |     K107    |  v[11]    ||    Reg 2 [24:31]     |     K11    |     K43     |  v[11]
-    // Reg 3 [0:7]       |     K12    |     K44     |     K76    |     K108    |  v[12]    ||    Reg 3 [0:7]       |     K12    |     K44     |  v[12]
-    // Reg 3 [8:15]      |     K13    |     K45     |     K77    |     K109    |  v[13]    ||    Reg 3 [8:15]      |     K13    |     K45     |  v[13]
-    // Reg 3 [16:23]     |     K14    |     K46     |     K78    |     K110    |  v[14]    ||    Reg 3 [16:23]     |     K14    |     K46     |  v[14]
-    // Reg 3 [24:31]     |     K15    |     K47     |     K79    |     K111    |  v[15]    ||    Reg 3 [24:31]     |     K15    |     K47     |  v[15]
-    // Reg 4 [0:7]       |     K16    |     K48     |     K80    |     K112    |  v[16]    ||    Reg 4 [0:7]       |     K16    |     K48     |  v[16]
-    // Reg 4 [8:15]      |     K17    |     K49     |     K81    |     K113    |  v[17]    ||    Reg 4 [8:15]      |     K17    |     K49     |  v[17]
-    // Reg 4 [16:23]     |     K18    |     K50     |     K82    |     K114    |  v[18]    ||    Reg 4 [16:23]     |     K18    |     K50     |  v[18]
-    // Reg 4 [24:31]     |     K19    |     K51     |     K83    |     K115    |  v[19]    ||    Reg 4 [24:31]     |     K19    |     K51     |  v[19]
-    // Reg 5 [0:7]       |     K20    |     K52     |     K84    |     K116    |  v[20]    ||    Reg 5 [0:7]       |     K20    |     K52     |  v[20]
-    // Reg 5 [8:15]      |     K21    |     K53     |     K85    |     K117    |  v[21]    ||    Reg 5 [8:15]      |     K21    |     K53     |  v[21]
-    // Reg 5 [16:23]     |     K22    |     K54     |     K86    |     K118    |  v[22]    ||    Reg 5 [16:23]     |     K22    |     K54     |  v[22]
-    // Reg 5 [24:31]     |     K23    |     K55     |     K87    |     K119    |  v[23]    ||    Reg 5 [24:31]     |     K23    |     K55     |  v[23]
-    // Reg 6 [0:7]       |     K24    |     K56     |     K88    |     K120    |  v[24]    ||    Reg 6 [0:7]       |     K24    |     K56     |  v[24]
-    // Reg 6 [8:15]      |     K25    |     K57     |     K89    |     K121    |  v[25]    ||    Reg 6 [8:15]      |     K25    |     K57     |  v[25]
-    // Reg 6 [16:23]     |     K26    |     K58     |     K90    |     K122    |  v[26]    ||    Reg 6 [16:23]     |     K26    |     K58     |  v[26]
-    // Reg 6 [24:31]     |     K27    |     K59     |     K91    |     K123    |  v[27]    ||    Reg 6 [24:31]     |     K27    |     K59     |  v[27]
-    // Reg 7 [0:7]       |     K28    |     K60     |     K92    |     K124    |  v[28]    ||    Reg 7 [0:7]       |     K28    |     K60     |  v[28]
-    // Reg 7 [8:15]      |     K29    |     K61     |     K93    |     K125    |  v[29]    ||    Reg 7 [8:15]      |     K29    |     K61     |  v[29]
-    // Reg 7 [16:23]     |     K30    |     K62     |     K94    |     K126    |  v[30]    ||    Reg 7 [16:23]     |     K30    |     K62     |  v[30]
-    // Reg 7 [24:31]     |     K31    |     K63     |     K95    |     K127    |  v[31]    ||    Reg 7 [24:31]     |     K31    |     K63     |  v[31]
+    // Size              |   BLOCK_M  |   BLOCK_M   |   BLOCK_M  |   BLOCK_M   |           ||    Size              |   BLOCK_M  |   BLOCK_M   |        |
+    // M                 | 0  ...  15 |  0  ...  15 | 0  ...  15 |  0  ...  15 | Vector    ||    M                 | 0  ...  31 |  0  ...  31 | Vector |
+    // Thread Id         | 0  ...  15 | 16  ...  31 | 32  ... 47 | 48  ...  63 | Element   ||    Thread Id         | 0  ...  31 | 32  ...  63 | Element|
+    // Register Element  |------------|-------------|------------|-------------|-----------||    Register Element  |------------|-------------|--------|
+    // Reg 0 [0:7]       |     K0     |     K16     |     K32    |     K48     |  v[0]     ||    Reg 0 [0:7]       |     K0     |     K16     |  v[0]  |
+    // Reg 0 [8:15]      |     K1     |     K17     |     K33    |     K49     |  v[1]     ||    Reg 0 [8:15]      |     K1     |     K17     |  v[1]  |
+    // Reg 0 [16:23]     |     K2     |     K18     |     K34    |     K50     |  v[2]     ||    Reg 0 [16:23]     |     K2     |     K18     |  v[2]  |
+    // Reg 0 [24:31]     |     K3     |     K19     |     K35    |     K51     |  v[3]     ||    Reg 0 [24:31]     |     K3     |     K19     |  v[3]  |
+    // Reg 1 [0:7]       |     K4     |     K20     |     K36    |     K52     |  v[4]     ||    Reg 1 [0:7]       |     K4     |     K20     |  v[4]  |
+    // Reg 1 [8:15]      |     K5     |     K21     |     K37    |     K53     |  v[5]     ||    Reg 1 [8:15]      |     K5     |     K21     |  v[5]  |
+    // Reg 1 [16:23]     |     K6     |     K22     |     K38    |     K54     |  v[6]     ||    Reg 1 [16:23]     |     K6     |     K22     |  v[6]  |
+    // Reg 1 [24:31]     |     K7     |     K23     |     K39    |     K55     |  v[7]     ||    Reg 1 [24:31]     |     K7     |     K23     |  v[7]  |
+    // Reg 2 [0:7]       |     K8     |     K24     |     K40    |     K56     |  v[8]     ||    Reg 2 [0:7]       |     K8     |     K24     |  v[8]  |
+    // Reg 2 [8:15]      |     K9     |     K25     |     K41    |     K57     |  v[9]     ||    Reg 2 [8:15]      |     K9     |     K25     |  v[9]  |
+    // Reg 2 [16:23]     |     K10    |     K26     |     K42    |     K58     |  v[10]    ||    Reg 2 [16:23]     |     K10    |     K26     |  v[10] |
+    // Reg 2 [24:31]     |     K11    |     K27     |     K43    |     K59     |  v[11]    ||    Reg 2 [24:31]     |     K11    |     K27     |  v[11] |
+    // Reg 3 [0:7]       |     K12    |     K28     |     K44    |     K60     |  v[12]    ||    Reg 3 [0:7]       |     K12    |     K28     |  v[12] |
+    // Reg 3 [8:15]      |     K13    |     K29     |     K45    |     K61     |  v[13]    ||    Reg 3 [8:15]      |     K13    |     K29     |  v[13] |
+    // Reg 3 [16:23]     |     K14    |     K30     |     K46    |     K62     |  v[14]    ||    Reg 3 [16:23]     |     K14    |     K30     |  v[14] |
+    // Reg 3 [24:31]     |     K15    |     K31     |     K47    |     K63     |  v[15]    ||    Reg 3 [24:31]     |     K15    |     K31     |  v[15] |
+    // Reg 4 [0:7]       |     K64    |     K80     |     K96    |     K112    |  v[16]    ||    Reg 4 [0:7]       |     K32    |     K48     |  v[16] |
+    // Reg 4 [8:15]      |     K65    |     K81     |     K97    |     K113    |  v[17]    ||    Reg 4 [8:15]      |     K33    |     K49     |  v[17] |
+    // Reg 4 [16:23]     |     K66    |     K82     |     K98    |     K114    |  v[18]    ||    Reg 4 [16:23]     |     K34    |     K50     |  v[18] |
+    // Reg 4 [24:31]     |     K67    |     K83     |     K99    |     K115    |  v[19]    ||    Reg 4 [24:31]     |     K35    |     K51     |  v[19] |
+    // Reg 5 [0:7]       |     K68    |     K84     |     K100   |     K116    |  v[20]    ||    Reg 5 [0:7]       |     K36    |     K52     |  v[20] |
+    // Reg 5 [8:15]      |     K69    |     K85     |     K101   |     K117    |  v[21]    ||    Reg 5 [8:15]      |     K37    |     K53     |  v[21] |
+    // Reg 5 [16:23]     |     K70    |     K86     |     K102   |     K118    |  v[22]    ||    Reg 5 [16:23]     |     K38    |     K54     |  v[22] |
+    // Reg 5 [24:31]     |     K71    |     K87     |     K103   |     K119    |  v[23]    ||    Reg 5 [24:31]     |     K39    |     K55     |  v[23] |
+    // Reg 6 [0:7]       |     K72    |     K88     |     K104   |     K120    |  v[24]    ||    Reg 6 [0:7]       |     K40    |     K56     |  v[24] |
+    // Reg 6 [8:15]      |     K73    |     K89     |     K105   |     K121    |  v[25]    ||    Reg 6 [8:15]      |     K41    |     K57     |  v[25] |
+    // Reg 6 [16:23]     |     K74    |     K90     |     K106   |     K122    |  v[26]    ||    Reg 6 [16:23]     |     K42    |     K58     |  v[26] |
+    // Reg 6 [24:31]     |     K75    |     K91     |     K107   |     K123    |  v[27]    ||    Reg 6 [24:31]     |     K43    |     K59     |  v[27] |
+    // Reg 7 [0:7]       |     K76    |     K92     |     K108   |     K124    |  v[28]    ||    Reg 7 [0:7]       |     K44    |     K60     |  v[28] |
+    // Reg 7 [8:15]      |     K77    |     K93     |     K109   |     K125    |  v[29]    ||    Reg 7 [8:15]      |     K45    |     K61     |  v[29] |
+    // Reg 7 [16:23]     |     K78    |     K94     |     K110   |     K126    |  v[30]    ||    Reg 7 [16:23]     |     K46    |     K62     |  v[30] |
+    // Reg 7 [24:31]     |     K79    |     K95     |     K111   |     K127    |  v[31]    ||    Reg 7 [24:31]     |     K47    |     K63     |  v[31] |
     // clang-format on
 
-    // Here we want to load a BLOCK_M x BLOCK_K block of data.
-    static constexpr uint32_t VW = vectorSize(AFragT{});
-    using ARawT                  = typename scalar_type<AFragT>::type;
-    using AScalarFragT           = vector_type<ARawT, VW>::type;
+    static constexpr int32_t WAVE_SIZE = 64;
+
+    // Here we want to load from rows of A in chunks of 16 elements each.
+    static constexpr uint32_t chunk_size = 16;
+
+    // each chunk is separated by offset
+    static constexpr uint32_t chunk_offset = chunk_size * WAVE_SIZE / BLOCK_M;
 
     // To start the loading process, let's visualize in 2D coords.
     // Each thread will load 32 elements.
     // We need to know where they start, and where the next elements are.
-    auto startCoord2D = std::make_pair(threadIdx.x % BLOCK_M,         // Row
-                                       (threadIdx.x / BLOCK_M) * VW); // Col
-    auto stepCoord2D  = std::make_pair(0u, 1u);
+    auto startCoord2D =
+        std::make_pair(threadIdx.x % BLOCK_M,                 // Row {0-31}  |  {0-15}
+                       (threadIdx.x / BLOCK_M) * chunk_size); // Col {0, 16} |  {0, 16, 32, 48}
+
+    auto minorStepCoord2D = std::make_pair(0u, 1u);          // read rows
+    auto majorStepCoord2D = std::make_pair(0, chunk_offset); // read a chunk from a row
 
     // Flatten to 1D col_major offsets.
     auto col_major = [](auto const& coord, auto ld) { return coord.first + coord.second * ld; };
 
     // BLOCK_M is a stride in A matrix
-    auto startOffset = col_major(startCoord2D, BLOCK_M);
-    auto kOffset     = col_major(stepCoord2D, BLOCK_M);
+    auto startOffset  = col_major(startCoord2D, BLOCK_M);
+    auto kMinorOffset = col_major(minorStepCoord2D, BLOCK_M);
+    auto kMajorOffset = col_major(majorStepCoord2D, BLOCK_M);
 
-    // kOffset == BLOCK_M
-    // This means every BLOCK_M element is loaded into output vector
-    auto fragA = AScalarFragT{};
-#pragma unroll VW
-    for(uint32_t i = 0; i < VW; i++)
+    using ARawT        = typename scalar_type<AFragT>::type;
+    using AScalarFragT = vector_type<ARawT, vectorSize(AFragT{})>::type;
+
+    AScalarFragT fragA{};
+
+#pragma unroll
+    for(int chunk = 0; chunk < 2; chunk++)
     {
-        fragA[i] = bit_cast<ARawT>(input_ptr[startOffset + i * kOffset]);
+#pragma unroll
+        for(uint32_t i = 0; i < chunk_size; i++)
+        {
+            fragA[chunk * chunk_size + i] =
+                bit_cast<ARawT>(input_ptr[startOffset + chunk * kMajorOffset + i * kMinorOffset]);
+        }
     }
 
     return fragA;
 }
 
+// Define a load function for input A blocks:
+// Size: (BLOCK_M x BLOCK_K)
+// - Data is in row major format
+// - Rows are loaded in contiguous chunks that map to corresponding microscales
+// - Each row is loaded in chunks of size 16 and each thread loads 32 elements
+template <typename AType, typename AFragT, int32_t BLOCK_M, int32_t BLOCK_K>
+__device__ AFragT load_A_row_major(AType const* input_ptr)
+{
+    // clang-format off
+    // Register Mapping for 16x128:                                                        ||    Register Mapping for 32x64:
+    // Size              |   BLOCK_M  |   BLOCK_M   |   BLOCK_M  |   BLOCK_M   |           ||    Size              |   BLOCK_M  |   BLOCK_M   |        |
+    // M                 | 0  ...  15 |  0  ...  15 | 0  ...  15 |  0  ...  15 | Vector    ||    M                 | 0  ...  31 |  0  ...  31 | Vector |
+    // Thread Id         | 0  ...  15 | 16  ...  31 | 32  ... 47 | 48  ...  63 | Element   ||    Thread Id         | 0  ...  31 | 32  ...  63 | Element|
+    // Register Element  |------------|-------------|------------|-------------|-----------||    Register Element  |------------|-------------|--------|
+    // Reg 0 [0:7]       |     K0     |     K16     |     K32    |     K48     |  v[0]     ||    Reg 0 [0:7]       |     K0     |     K16     |  v[0]  |
+    // Reg 0 [8:15]      |     K1     |     K17     |     K33    |     K49     |  v[1]     ||    Reg 0 [8:15]      |     K1     |     K17     |  v[1]  |
+    // Reg 0 [16:23]     |     K2     |     K18     |     K34    |     K50     |  v[2]     ||    Reg 0 [16:23]     |     K2     |     K18     |  v[2]  |
+    // Reg 0 [24:31]     |     K3     |     K19     |     K35    |     K51     |  v[3]     ||    Reg 0 [24:31]     |     K3     |     K19     |  v[3]  |
+    // Reg 1 [0:7]       |     K4     |     K20     |     K36    |     K52     |  v[4]     ||    Reg 1 [0:7]       |     K4     |     K20     |  v[4]  |
+    // Reg 1 [8:15]      |     K5     |     K21     |     K37    |     K53     |  v[5]     ||    Reg 1 [8:15]      |     K5     |     K21     |  v[5]  |
+    // Reg 1 [16:23]     |     K6     |     K22     |     K38    |     K54     |  v[6]     ||    Reg 1 [16:23]     |     K6     |     K22     |  v[6]  |
+    // Reg 1 [24:31]     |     K7     |     K23     |     K39    |     K55     |  v[7]     ||    Reg 1 [24:31]     |     K7     |     K23     |  v[7]  |
+    // Reg 2 [0:7]       |     K8     |     K24     |     K40    |     K56     |  v[8]     ||    Reg 2 [0:7]       |     K8     |     K24     |  v[8]  |
+    // Reg 2 [8:15]      |     K9     |     K25     |     K41    |     K57     |  v[9]     ||    Reg 2 [8:15]      |     K9     |     K25     |  v[9]  |
+    // Reg 2 [16:23]     |     K10    |     K26     |     K42    |     K58     |  v[10]    ||    Reg 2 [16:23]     |     K10    |     K26     |  v[10] |
+    // Reg 2 [24:31]     |     K11    |     K27     |     K43    |     K59     |  v[11]    ||    Reg 2 [24:31]     |     K11    |     K27     |  v[11] |
+    // Reg 3 [0:7]       |     K12    |     K28     |     K44    |     K60     |  v[12]    ||    Reg 3 [0:7]       |     K12    |     K28     |  v[12] |
+    // Reg 3 [8:15]      |     K13    |     K29     |     K45    |     K61     |  v[13]    ||    Reg 3 [8:15]      |     K13    |     K29     |  v[13] |
+    // Reg 3 [16:23]     |     K14    |     K30     |     K46    |     K62     |  v[14]    ||    Reg 3 [16:23]     |     K14    |     K30     |  v[14] |
+    // Reg 3 [24:31]     |     K15    |     K31     |     K47    |     K63     |  v[15]    ||    Reg 3 [24:31]     |     K15    |     K31     |  v[15] |
+    // Reg 4 [0:7]       |     K64    |     K80     |     K96    |     K112    |  v[16]    ||    Reg 4 [0:7]       |     K32    |     K48     |  v[16] |
+    // Reg 4 [8:15]      |     K65    |     K81     |     K97    |     K113    |  v[17]    ||    Reg 4 [8:15]      |     K33    |     K49     |  v[17] |
+    // Reg 4 [16:23]     |     K66    |     K82     |     K98    |     K114    |  v[18]    ||    Reg 4 [16:23]     |     K34    |     K50     |  v[18] |
+    // Reg 4 [24:31]     |     K67    |     K83     |     K99    |     K115    |  v[19]    ||    Reg 4 [24:31]     |     K35    |     K51     |  v[19] |
+    // Reg 5 [0:7]       |     K68    |     K84     |     K100   |     K116    |  v[20]    ||    Reg 5 [0:7]       |     K36    |     K52     |  v[20] |
+    // Reg 5 [8:15]      |     K69    |     K85     |     K101   |     K117    |  v[21]    ||    Reg 5 [8:15]      |     K37    |     K53     |  v[21] |
+    // Reg 5 [16:23]     |     K70    |     K86     |     K102   |     K118    |  v[22]    ||    Reg 5 [16:23]     |     K38    |     K54     |  v[22] |
+    // Reg 5 [24:31]     |     K71    |     K87     |     K103   |     K119    |  v[23]    ||    Reg 5 [24:31]     |     K39    |     K55     |  v[23] |
+    // Reg 6 [0:7]       |     K72    |     K88     |     K104   |     K120    |  v[24]    ||    Reg 6 [0:7]       |     K40    |     K56     |  v[24] |
+    // Reg 6 [8:15]      |     K73    |     K89     |     K105   |     K121    |  v[25]    ||    Reg 6 [8:15]      |     K41    |     K57     |  v[25] |
+    // Reg 6 [16:23]     |     K74    |     K90     |     K106   |     K122    |  v[26]    ||    Reg 6 [16:23]     |     K42    |     K58     |  v[26] |
+    // Reg 6 [24:31]     |     K75    |     K91     |     K107   |     K123    |  v[27]    ||    Reg 6 [24:31]     |     K43    |     K59     |  v[27] |
+    // Reg 7 [0:7]       |     K76    |     K92     |     K108   |     K124    |  v[28]    ||    Reg 7 [0:7]       |     K44    |     K60     |  v[28] |
+    // Reg 7 [8:15]      |     K77    |     K93     |     K109   |     K125    |  v[29]    ||    Reg 7 [8:15]      |     K45    |     K61     |  v[29] |
+    // Reg 7 [16:23]     |     K78    |     K94     |     K110   |     K126    |  v[30]    ||    Reg 7 [16:23]     |     K46    |     K62     |  v[30] |
+    // Reg 7 [24:31]     |     K79    |     K95     |     K111   |     K127    |  v[31]    ||    Reg 7 [24:31]     |     K47    |     K63     |  v[31] |
+    // clang-format on
+
+    static constexpr int32_t WAVE_SIZE = 64;
+
+    // Here we want to load from rows of A in chunks of 16 elements each.
+    static constexpr uint32_t chunk_size = 16;
+
+    // each chunk is separated by offset
+    static constexpr uint32_t chunk_offset = chunk_size * WAVE_SIZE / BLOCK_M;
+
+    // To start the loading process, let's visualize in 2D coords.
+    // Each thread will load 32 elements.
+    // We need to know where they start, and where the next elements are.
+    auto startCoord2D =
+        std::make_pair(threadIdx.x % BLOCK_M,                 // Row {0-31}  |  {0-15}
+                       (threadIdx.x / BLOCK_M) * chunk_size); // Col {0, 16} |  {0, 16, 32, 48}
+
+    // auto minorStepCoord2D = std::make_pair(0u, 1u);          // read rows
+    auto majorStepCoord2D = std::make_pair(0, chunk_offset); // read a chunk from a row
+
+    // Flatten to 1D row_major offsets.
+    auto row_major = [](auto const& coord, auto ld) { return coord.first * ld + coord.second; };
+
+    // BLOCK_K is a stride in A matrix
+    auto startOffset = row_major(startCoord2D, BLOCK_K);
+    // auto kMinorOffset = row_major(minorStepCoord2D, BLOCK_K);
+    auto kMajorOffset = row_major(majorStepCoord2D, BLOCK_K);
+
+    using ARawT        = typename scalar_type<AFragT>::type;
+    using AScalarFragT = vector_type<ARawT, chunk_size>::type;
+
+    union
+    {
+        AFragT frag;
+        AScalarFragT chunks[2];
+    } fragA{};
+
+    auto* fragPtr   = reinterpret_cast<AScalarFragT const*>(input_ptr + startOffset);
+    fragA.chunks[0] = *fragPtr;
+    fragPtr         = reinterpret_cast<AScalarFragT const*>(input_ptr + startOffset + kMajorOffset);
+    fragA.chunks[1] = *fragPtr;
+
+    return fragA.frag;
+}
+
+// Define a load function for scaled A blocks:
+// Size: (BLOCK_M x BLOCK_K)
+// ASSUMPTION:
+// - The scale inputs distributed across 64 lanes.
+template <typename AType,
+          typename AFragT,
+          typename ScaleType,
+          typename ScaleFragT,
+          int32_t BLOCK_M,
+          int32_t BLOCK_K,
+          int32_t BLOCK_X>
+__device__ AFragT load_mx_A_row_major(AType const* input_ptr,
+                                      ScaleType const* scale_ptr,
+                                      ScaleFragT& fragX)
+{
+    // clang-format off
+    // Register Mapping for 16x128:                                                                              ||    Register Mapping for 32x64:
+    // Size              |   BLOCK_M  |   BLOCK_M   |          |   BLOCK_M  |   BLOCK_M   |          |           ||    Size              |   BLOCK_M  |   BLOCK_M   |        |          |
+    // M                 | 0  ...  15 |  0  ...  15 |          | 0  ...  15 |  0  ...  15 |          | Vector    ||    M                 | 0  ...  31 |  0  ...  31 | Vector |          |
+    // Thread Id         | 0  ...  15 | 16  ...  31 |  Scale   | 32  ... 47 | 48  ...  63 |  Scale   | Element   ||    Thread Id         | 0  ...  31 | 32  ...  63 | Element|  Scale   |
+    // Register Element   ------------ ------------- ----------|------------ ------------- ----------|-----------||    Register Element  |------------|-------------|--------|----------|
+    // Reg 0 [0:7]       |     K0     |     K16     |  x(M,0)  |     K32    |     K48     |  x(M,1)  |  v[0]     ||    Reg 0 [0:7]       |     K0     |     K16     |  v[0]  |  x(M,0)  |
+    // Reg 0 [8:15]      |     K1     |     K17     |  x(M,0)  |     K33    |     K49     |  x(M,1)  |  v[1]     ||    Reg 0 [8:15]      |     K1     |     K17     |  v[1]  |  x(M,0)  |
+    // Reg 0 [16:23]     |     K2     |     K18     |  x(M,0)  |     K34    |     K50     |  x(M,1)  |  v[2]     ||    Reg 0 [16:23]     |     K2     |     K18     |  v[2]  |  x(M,0)  |
+    // Reg 0 [24:31]     |     K3     |     K19     |  x(M,0)  |     K35    |     K51     |  x(M,1)  |  v[3]     ||    Reg 0 [24:31]     |     K3     |     K19     |  v[3]  |  x(M,0)  |
+    // Reg 1 [0:7]       |     K4     |     K20     |  x(M,0)  |     K36    |     K52     |  x(M,1)  |  v[4]     ||    Reg 1 [0:7]       |     K4     |     K20     |  v[4]  |  x(M,0)  |
+    // Reg 1 [8:15]      |     K5     |     K21     |  x(M,0)  |     K37    |     K53     |  x(M,1)  |  v[5]     ||    Reg 1 [8:15]      |     K5     |     K21     |  v[5]  |  x(M,0)  |
+    // Reg 1 [16:23]     |     K6     |     K22     |  x(M,0)  |     K38    |     K54     |  x(M,1)  |  v[6]     ||    Reg 1 [16:23]     |     K6     |     K22     |  v[6]  |  x(M,0)  |
+    // Reg 1 [24:31]     |     K7     |     K23     |  x(M,0)  |     K39    |     K55     |  x(M,1)  |  v[7]     ||    Reg 1 [24:31]     |     K7     |     K23     |  v[7]  |  x(M,0)  |
+    // Reg 2 [0:7]       |     K8     |     K24     |  x(M,0)  |     K40    |     K56     |  x(M,1)  |  v[8]     ||    Reg 2 [0:7]       |     K8     |     K24     |  v[8]  |  x(M,0)  |
+    // Reg 2 [8:15]      |     K9     |     K25     |  x(M,0)  |     K41    |     K57     |  x(M,1)  |  v[9]     ||    Reg 2 [8:15]      |     K9     |     K25     |  v[9]  |  x(M,0)  |
+    // Reg 2 [16:23]     |     K10    |     K26     |  x(M,0)  |     K42    |     K58     |  x(M,1)  |  v[10]    ||    Reg 2 [16:23]     |     K10    |     K26     |  v[10] |  x(M,0)  |
+    // Reg 2 [24:31]     |     K11    |     K27     |  x(M,0)  |     K43    |     K59     |  x(M,1)  |  v[11]    ||    Reg 2 [24:31]     |     K11    |     K27     |  v[11] |  x(M,0)  |
+    // Reg 3 [0:7]       |     K12    |     K28     |  x(M,0)  |     K44    |     K60     |  x(M,1)  |  v[12]    ||    Reg 3 [0:7]       |     K12    |     K28     |  v[12] |  x(M,0)  |
+    // Reg 3 [8:15]      |     K13    |     K29     |  x(M,0)  |     K45    |     K61     |  x(M,1)  |  v[13]    ||    Reg 3 [8:15]      |     K13    |     K29     |  v[13] |  x(M,0)  |
+    // Reg 3 [16:23]     |     K14    |     K30     |  x(M,0)  |     K46    |     K62     |  x(M,1)  |  v[14]    ||    Reg 3 [16:23]     |     K14    |     K30     |  v[14] |  x(M,0)  |
+    // Reg 3 [24:31]     |     K15    |     K31     |  x(M,0)  |     K47    |     K63     |  x(M,1)  |  v[15]    ||    Reg 3 [24:31]     |     K15    |     K31     |  v[15] |  x(M,0)  |
+    // Reg 4 [0:7]       |     K64    |     K80     |  x(M,2)  |     K96    |     K112    |  x(M,3)  |  v[16]    ||    Reg 4 [0:7]       |     K32    |     K48     |  v[16] |  x(M,1)  |
+    // Reg 4 [8:15]      |     K65    |     K81     |  x(M,2)  |     K97    |     K113    |  x(M,3)  |  v[17]    ||    Reg 4 [8:15]      |     K33    |     K49     |  v[17] |  x(M,1)  |
+    // Reg 4 [16:23]     |     K66    |     K82     |  x(M,2)  |     K98    |     K114    |  x(M,3)  |  v[18]    ||    Reg 4 [16:23]     |     K34    |     K50     |  v[18] |  x(M,1)  |
+    // Reg 4 [24:31]     |     K67    |     K83     |  x(M,2)  |     K99    |     K115    |  x(M,3)  |  v[19]    ||    Reg 4 [24:31]     |     K35    |     K51     |  v[19] |  x(M,1)  |
+    // Reg 5 [0:7]       |     K68    |     K84     |  x(M,2)  |     K100   |     K116    |  x(M,3)  |  v[20]    ||    Reg 5 [0:7]       |     K36    |     K52     |  v[20] |  x(M,1)  |
+    // Reg 5 [8:15]      |     K69    |     K85     |  x(M,2)  |     K101   |     K117    |  x(M,3)  |  v[21]    ||    Reg 5 [8:15]      |     K37    |     K53     |  v[21] |  x(M,1)  |
+    // Reg 5 [16:23]     |     K70    |     K86     |  x(M,2)  |     K102   |     K118    |  x(M,3)  |  v[22]    ||    Reg 5 [16:23]     |     K38    |     K54     |  v[22] |  x(M,1)  |
+    // Reg 5 [24:31]     |     K71    |     K87     |  x(M,2)  |     K103   |     K119    |  x(M,3)  |  v[23]    ||    Reg 5 [24:31]     |     K39    |     K55     |  v[23] |  x(M,1)  |
+    // Reg 6 [0:7]       |     K72    |     K88     |  x(M,2)  |     K104   |     K120    |  x(M,3)  |  v[24]    ||    Reg 6 [0:7]       |     K40    |     K56     |  v[24] |  x(M,1)  |
+    // Reg 6 [8:15]      |     K73    |     K89     |  x(M,2)  |     K105   |     K121    |  x(M,3)  |  v[25]    ||    Reg 6 [8:15]      |     K41    |     K57     |  v[25] |  x(M,1)  |
+    // Reg 6 [16:23]     |     K74    |     K90     |  x(M,2)  |     K106   |     K122    |  x(M,3)  |  v[26]    ||    Reg 6 [16:23]     |     K42    |     K58     |  v[26] |  x(M,1)  |
+    // Reg 6 [24:31]     |     K75    |     K91     |  x(M,2)  |     K107   |     K123    |  x(M,3)  |  v[27]    ||    Reg 6 [24:31]     |     K43    |     K59     |  v[27] |  x(M,1)  |
+    // Reg 7 [0:7]       |     K76    |     K92     |  x(M,2)  |     K108   |     K124    |  x(M,3)  |  v[28]    ||    Reg 7 [0:7]       |     K44    |     K60     |  v[28] |  x(M,1)  |
+    // Reg 7 [8:15]      |     K77    |     K93     |  x(M,2)  |     K109   |     K125    |  x(M,3)  |  v[29]    ||    Reg 7 [8:15]      |     K45    |     K61     |  v[29] |  x(M,1)  |
+    // Reg 7 [16:23]     |     K78    |     K94     |  x(M,2)  |     K110   |     K126    |  x(M,3)  |  v[30]    ||    Reg 7 [16:23]     |     K46    |     K62     |  v[30] |  x(M,1)  |
+    // Reg 7 [24:31]     |     K79    |     K95     |  x(M,2)  |     K111   |     K127    |  x(M,3)  |  v[31]    ||    Reg 7 [24:31]     |     K47    |     K63     |  v[31] |  x(M,1)  |
+    // clang-format on
+    static constexpr uint32_t VW = vectorSize(AFragT{});
+    static_assert(VW == BLOCK_X, "Fragment size must be equal to BLOCK_X");
+
+    // To start the loading process, let's visualize in 2D coords.
+    // Each thread will load 1 element
+    // We need to know where they start
+    auto startCoord2D = std::make_pair(threadIdx.x % BLOCK_M,                   // Row
+                                       (threadIdx.x / BLOCK_M) * VW / BLOCK_X); // Col
+
+    // Flatten to 1D row_major offsets.
+    auto row_major = [](auto const& coord, auto ld) { return coord.first * ld + coord.second; };
+
+    // BLOCK_K / BLOCK_X is a stride in xA matrix
+    auto startOffset = row_major(startCoord2D, BLOCK_K / BLOCK_X);
+
+    // obtain 8-bit exponent
+    fragX = utils::get_exponent_value(scale_ptr[startOffset]) & 0xFF;
+
+    return load_A_row_major<AType, AFragT, BLOCK_M, BLOCK_K>(input_ptr);
+}
+
 // Define a load function for input B blocks:
 // Size: (BLOCK_K x BLOCK_N)
-// ASSUMPTION:
-// - We want contiguous BLOCK_N sized row neighbors in register.
-// - Data is in row_major format
-// This means:
-// - From B we will load K rows of size BLOCK_N to satisfy our input data
+// - Data is in col major format
+// - Cols are loaded in contiguous chunks that map to corresponding microscales
+// - Each col is loaded in chunks of size 16 and each thread loads 32 elements
 template <typename BType, typename BFragT, int32_t BLOCK_K, int32_t BLOCK_N>
 __device__ BFragT load_B_col_major(BType const* input_ptr)
 {
     // clang-format off
     // Register Mapping for 128x16:                                                        ||    Register Mapping for 64x32:
-    // Size              |   BLOCK_N  |   BLOCK_N   |   BLOCK_N  |   BLOCK_N   |           ||    Size              |   BLOCK_N  |   BLOCK_N   |
-    // N                 | 0  ...  15 |  0  ...  15 | 0  ...  15 |  0  ...  15 |           ||    N                 | 0  ...  31 |  0  ...  31 |
-    // Thread Id         | 0  ...  15 | 16  ...  31 | 32  ... 47 | 48  ...  63 | Vector    ||    Thread Id         | 0  ...  31 | 32  ...  63 | Vector
-    // Register Element   ------------ ------------- ------------ -------------  Element   ||    Register Element   ------------ -------------  Element
-    // Reg 0 [0:7]       |     K0     |     K32     |     K64    |     K96     |  v[0]     ||    Reg 0 [0:7]       |     K0     |     K32     |  v[0]
-    // Reg 0 [8:15]      |     K1     |     K33     |     K65    |     K97     |  v[1]     ||    Reg 0 [8:15]      |     K1     |     K33     |  v[1]
-    // Reg 0 [16:23]     |     K2     |     K34     |     K66    |     K98     |  v[2]     ||    Reg 0 [16:23]     |     K2     |     K34     |  v[2]
-    // Reg 0 [24:31]     |     K3     |     K35     |     K67    |     K99     |  v[3]     ||    Reg 0 [24:31]     |     K3     |     K35     |  v[3]
-    // Reg 1 [0:7]       |     K4     |     K36     |     K68    |     K100    |  v[4]     ||    Reg 1 [0:7]       |     K4     |     K36     |  v[4]
-    // Reg 1 [8:15]      |     K5     |     K37     |     K69    |     K101    |  v[5]     ||    Reg 1 [8:15]      |     K5     |     K37     |  v[5]
-    // Reg 1 [16:23]     |     K6     |     K38     |     K70    |     K102    |  v[6]     ||    Reg 1 [16:23]     |     K6     |     K38     |  v[6]
-    // Reg 1 [24:31]     |     K7     |     K39     |     K71    |     K103    |  v[7]     ||    Reg 1 [24:31]     |     K7     |     K39     |  v[7]
-    // Reg 2 [0:7]       |     K8     |     K40     |     K72    |     K104    |  v[8]     ||    Reg 2 [0:7]       |     K8     |     K40     |  v[8]
-    // Reg 2 [8:15]      |     K9     |     K41     |     K73    |     K105    |  v[9]     ||    Reg 2 [8:15]      |     K9     |     K41     |  v[9]
-    // Reg 2 [16:23]     |     K10    |     K42     |     K74    |     K106    |  v[10]    ||    Reg 2 [16:23]     |     K10    |     K42     |  v[10]
-    // Reg 2 [24:31]     |     K11    |     K43     |     K75    |     K107    |  v[11]    ||    Reg 2 [24:31]     |     K11    |     K43     |  v[11]
-    // Reg 3 [0:7]       |     K12    |     K44     |     K76    |     K108    |  v[12]    ||    Reg 3 [0:7]       |     K12    |     K44     |  v[12]
-    // Reg 3 [8:15]      |     K13    |     K45     |     K77    |     K109    |  v[13]    ||    Reg 3 [8:15]      |     K13    |     K45     |  v[13]
-    // Reg 3 [16:23]     |     K14    |     K46     |     K78    |     K110    |  v[14]    ||    Reg 3 [16:23]     |     K14    |     K46     |  v[14]
-    // Reg 3 [24:31]     |     K15    |     K47     |     K79    |     K111    |  v[15]    ||    Reg 3 [24:31]     |     K15    |     K47     |  v[15]
-    // Reg 4 [0:7]       |     K16    |     K48     |     K80    |     K112    |  v[16]    ||    Reg 4 [0:7]       |     K16    |     K48     |  v[16]
-    // Reg 4 [8:15]      |     K17    |     K49     |     K81    |     K113    |  v[17]    ||    Reg 4 [8:15]      |     K17    |     K49     |  v[17]
-    // Reg 4 [16:23]     |     K18    |     K50     |     K82    |     K114    |  v[18]    ||    Reg 4 [16:23]     |     K18    |     K50     |  v[18]
-    // Reg 4 [24:31]     |     K19    |     K51     |     K83    |     K115    |  v[19]    ||    Reg 4 [24:31]     |     K19    |     K51     |  v[19]
-    // Reg 5 [0:7]       |     K20    |     K52     |     K84    |     K116    |  v[20]    ||    Reg 5 [0:7]       |     K20    |     K52     |  v[20]
-    // Reg 5 [8:15]      |     K21    |     K53     |     K85    |     K117    |  v[21]    ||    Reg 5 [8:15]      |     K21    |     K53     |  v[21]
-    // Reg 5 [16:23]     |     K22    |     K54     |     K86    |     K118    |  v[22]    ||    Reg 5 [16:23]     |     K22    |     K54     |  v[22]
-    // Reg 5 [24:31]     |     K23    |     K55     |     K87    |     K119    |  v[23]    ||    Reg 5 [24:31]     |     K23    |     K55     |  v[23]
-    // Reg 6 [0:7]       |     K24    |     K56     |     K88    |     K120    |  v[24]    ||    Reg 6 [0:7]       |     K24    |     K56     |  v[24]
-    // Reg 6 [8:15]      |     K25    |     K57     |     K89    |     K121    |  v[25]    ||    Reg 6 [8:15]      |     K25    |     K57     |  v[25]
-    // Reg 6 [16:23]     |     K26    |     K58     |     K90    |     K122    |  v[26]    ||    Reg 6 [16:23]     |     K26    |     K58     |  v[26]
-    // Reg 6 [24:31]     |     K27    |     K59     |     K91    |     K123    |  v[27]    ||    Reg 6 [24:31]     |     K27    |     K59     |  v[27]
-    // Reg 7 [0:7]       |     K28    |     K60     |     K92    |     K124    |  v[28]    ||    Reg 7 [0:7]       |     K28    |     K60     |  v[28]
-    // Reg 7 [8:15]      |     K29    |     K61     |     K93    |     K125    |  v[29]    ||    Reg 7 [8:15]      |     K29    |     K61     |  v[29]
-    // Reg 7 [16:23]     |     K30    |     K62     |     K94    |     K126    |  v[30]    ||    Reg 7 [16:23]     |     K30    |     K62     |  v[30]
-    // Reg 7 [24:31]     |     K31    |     K63     |     K95    |     K127    |  v[31]    ||    Reg 7 [24:31]     |     K31    |     K63     |  v[31]
+    // Size              |   BLOCK_N  |   BLOCK_N   |   BLOCK_N  |   BLOCK_N   |           ||    Size              |   BLOCK_N  |   BLOCK_N   |        |
+    // N                 | 0  ...  15 |  0  ...  15 | 0  ...  15 |  0  ...  15 | Vector    ||    N                 | 0  ...  31 |  0  ...  31 | Vector |
+    // Thread Id         | 0  ...  15 | 16  ...  31 | 32  ... 47 | 48  ...  63 | Element   ||    Thread Id         | 0  ...  31 | 32  ...  63 | Element|
+    // Register Element  |------------|-------------|------------|-------------|-----------||    Register Element  |------------|-------------|--------|
+    // Reg 0 [0:7]       |     K0     |     K16     |     K32    |     K48     |  v[0]     ||    Reg 0 [0:7]       |     K0     |     K16     |  v[0]  |
+    // Reg 0 [8:15]      |     K1     |     K17     |     K33    |     K49     |  v[1]     ||    Reg 0 [8:15]      |     K1     |     K17     |  v[1]  |
+    // Reg 0 [16:23]     |     K2     |     K18     |     K34    |     K50     |  v[2]     ||    Reg 0 [16:23]     |     K2     |     K18     |  v[2]  |
+    // Reg 0 [24:31]     |     K3     |     K19     |     K35    |     K51     |  v[3]     ||    Reg 0 [24:31]     |     K3     |     K19     |  v[3]  |
+    // Reg 1 [0:7]       |     K4     |     K20     |     K36    |     K52     |  v[4]     ||    Reg 1 [0:7]       |     K4     |     K20     |  v[4]  |
+    // Reg 1 [8:15]      |     K5     |     K21     |     K37    |     K53     |  v[5]     ||    Reg 1 [8:15]      |     K5     |     K21     |  v[5]  |
+    // Reg 1 [16:23]     |     K6     |     K22     |     K38    |     K54     |  v[6]     ||    Reg 1 [16:23]     |     K6     |     K22     |  v[6]  |
+    // Reg 1 [24:31]     |     K7     |     K23     |     K39    |     K55     |  v[7]     ||    Reg 1 [24:31]     |     K7     |     K23     |  v[7]  |
+    // Reg 2 [0:7]       |     K8     |     K24     |     K40    |     K56     |  v[8]     ||    Reg 2 [0:7]       |     K8     |     K24     |  v[8]  |
+    // Reg 2 [8:15]      |     K9     |     K25     |     K41    |     K57     |  v[9]     ||    Reg 2 [8:15]      |     K9     |     K25     |  v[9]  |
+    // Reg 2 [16:23]     |     K10    |     K26     |     K42    |     K58     |  v[10]    ||    Reg 2 [16:23]     |     K10    |     K26     |  v[10] |
+    // Reg 2 [24:31]     |     K11    |     K27     |     K43    |     K59     |  v[11]    ||    Reg 2 [24:31]     |     K11    |     K27     |  v[11] |
+    // Reg 3 [0:7]       |     K12    |     K28     |     K44    |     K60     |  v[12]    ||    Reg 3 [0:7]       |     K12    |     K28     |  v[12] |
+    // Reg 3 [8:15]      |     K13    |     K29     |     K45    |     K61     |  v[13]    ||    Reg 3 [8:15]      |     K13    |     K29     |  v[13] |
+    // Reg 3 [16:23]     |     K14    |     K30     |     K46    |     K62     |  v[14]    ||    Reg 3 [16:23]     |     K14    |     K30     |  v[14] |
+    // Reg 3 [24:31]     |     K15    |     K31     |     K47    |     K63     |  v[15]    ||    Reg 3 [24:31]     |     K15    |     K31     |  v[15] |
+    // Reg 4 [0:7]       |     K64    |     K80     |     K96    |     K112    |  v[16]    ||    Reg 4 [0:7]       |     K32    |     K48     |  v[16] |
+    // Reg 4 [8:15]      |     K65    |     K81     |     K97    |     K113    |  v[17]    ||    Reg 4 [8:15]      |     K33    |     K49     |  v[17] |
+    // Reg 4 [16:23]     |     K66    |     K82     |     K98    |     K114    |  v[18]    ||    Reg 4 [16:23]     |     K34    |     K50     |  v[18] |
+    // Reg 4 [24:31]     |     K67    |     K83     |     K99    |     K115    |  v[19]    ||    Reg 4 [24:31]     |     K35    |     K51     |  v[19] |
+    // Reg 5 [0:7]       |     K68    |     K84     |     K100   |     K116    |  v[20]    ||    Reg 5 [0:7]       |     K36    |     K52     |  v[20] |
+    // Reg 5 [8:15]      |     K69    |     K85     |     K101   |     K117    |  v[21]    ||    Reg 5 [8:15]      |     K37    |     K53     |  v[21] |
+    // Reg 5 [16:23]     |     K70    |     K86     |     K102   |     K118    |  v[22]    ||    Reg 5 [16:23]     |     K38    |     K54     |  v[22] |
+    // Reg 5 [24:31]     |     K71    |     K87     |     K103   |     K119    |  v[23]    ||    Reg 5 [24:31]     |     K39    |     K55     |  v[23] |
+    // Reg 6 [0:7]       |     K72    |     K88     |     K104   |     K120    |  v[24]    ||    Reg 6 [0:7]       |     K40    |     K56     |  v[24] |
+    // Reg 6 [8:15]      |     K73    |     K89     |     K105   |     K121    |  v[25]    ||    Reg 6 [8:15]      |     K41    |     K57     |  v[25] |
+    // Reg 6 [16:23]     |     K74    |     K90     |     K106   |     K122    |  v[26]    ||    Reg 6 [16:23]     |     K42    |     K58     |  v[26] |
+    // Reg 6 [24:31]     |     K75    |     K91     |     K107   |     K123    |  v[27]    ||    Reg 6 [24:31]     |     K43    |     K59     |  v[27] |
+    // Reg 7 [0:7]       |     K76    |     K92     |     K108   |     K124    |  v[28]    ||    Reg 7 [0:7]       |     K44    |     K60     |  v[28] |
+    // Reg 7 [8:15]      |     K77    |     K93     |     K109   |     K125    |  v[29]    ||    Reg 7 [8:15]      |     K45    |     K61     |  v[29] |
+    // Reg 7 [16:23]     |     K78    |     K94     |     K110   |     K126    |  v[30]    ||    Reg 7 [16:23]     |     K46    |     K62     |  v[30] |
+    // Reg 7 [24:31]     |     K79    |     K95     |     K111   |     K127    |  v[31]    ||    Reg 7 [24:31]     |     K47    |     K63     |  v[31] |
     // clang-format on
 
-    // Here we want to load a BLOCK_K x BLOCK_N block of data.
-    static constexpr uint32_t VW = vectorSize(BFragT{});
+    static constexpr int32_t WAVE_SIZE = 64;
+
+    // Here we want to load from cols of B in chunks of 16 elements each.
+    static constexpr uint32_t chunk_size = 16;
+
+    // each chunk is separated by an offset
+    static constexpr uint32_t chunk_offset = chunk_size * WAVE_SIZE / BLOCK_N; // 32 or 64
 
     // To start the loading process, let's visualize in 2D coords.
     // Each thread will load 32 elements.
     // We need to know where they start, and where the next elements are.
-    auto startCoord2D = std::make_pair((threadIdx.x / BLOCK_N) * VW, // Row
-                                       threadIdx.x % BLOCK_N);       // Col
+    auto startCoord2D =
+        std::make_pair((threadIdx.x / BLOCK_N) * chunk_size, // Row {0, 16} |  {0, 16, 32, 48}
+                       threadIdx.x % BLOCK_N);               // Col {0-31}  |  {0-15}
 
     // Flatten to 1D col_major offsets.
     auto col_major = [](auto const& coord, auto ld) { return coord.first + coord.second * ld; };
 
-    auto startOffset = col_major(startCoord2D, BLOCK_K);
+    // auto minorStepCoord2D = std::make_pair(1u, 0u);       // read cols
+    auto majorStepCoord2D = std::make_pair(chunk_offset, 0); // read a chunk from a col
 
-    auto const* fragPtr = reinterpret_cast<BFragT const*>(input_ptr + startOffset);
-    return *fragPtr;
+    // BLOCK_K is a stride in B matrix
+    auto startOffset = col_major(startCoord2D, BLOCK_K);
+    // auto kMinorOffset = col_major(minorStepCoord2D, BLOCK_K);
+    auto kMajorOffset = col_major(majorStepCoord2D, BLOCK_K);
+
+    using BRawT        = typename scalar_type<BFragT>::type;
+    using BScalarFragT = vector_type<BRawT, chunk_size>::type;
+
+    union
+    {
+        BFragT frag;
+        BScalarFragT chunks[2];
+    } fragB{};
+
+    auto* fragPtr   = reinterpret_cast<BScalarFragT const*>(input_ptr + startOffset);
+    fragB.chunks[0] = *fragPtr;
+    fragPtr         = reinterpret_cast<BScalarFragT const*>(input_ptr + startOffset + kMajorOffset);
+    fragB.chunks[1] = *fragPtr;
+
+    return fragB.frag;
+}
+
+// Define a load function for scaled B blocks:
+// Size: (BLOCK_K x BLOCK_N)
+// ASSUMPTION:
+// - The scale inputs distributed across 64 lanes.
+template <typename BType,
+          typename BFragT,
+          typename ScaleType,
+          typename ScaleFragT,
+          int32_t BLOCK_K,
+          int32_t BLOCK_N,
+          int32_t BLOCK_X>
+__device__ BFragT load_mx_B_col_major(BType const* input_ptr,
+                                      ScaleType const* scale_ptr,
+                                      ScaleFragT& fragX)
+
+{
+    // clang-format off
+    // Register Mapping for 128x16:                                                                              ||    Register Mapping for 64x32:
+    // Size              |   BLOCK_N  |   BLOCK_N   |          |   BLOCK_N  |   BLOCK_N   |          |           ||    Size              |   BLOCK_N  |   BLOCK_N   |        |          |
+    // N                 | 0  ...  15 |  0  ...  15 |          | 0  ...  15 |  0  ...  15 |          | Vector    ||    N                 | 0  ...  31 |  0  ...  31 | Vector |          |
+    // Thread Id         | 0  ...  15 | 16  ...  31 |  Scale   | 32  ... 47 | 48  ...  63 |  Scale   | Element   ||    Thread Id         | 0  ...  31 | 32  ...  63 | Element|  Scale   |
+    // Register Element   ------------ ------------- ----------|------------ ------------- ----------|-----------||    Register Element  |------------|-------------|--------|----------|
+    // Reg 0 [0:7]       |     K0     |     K16     |  x(0,N)  |     K32    |     K48     |  x(1,N)  |  v[0]     ||    Reg 0 [0:7]       |     K0     |     K16     |  v[0]  |  x(0,N)  |
+    // Reg 0 [8:15]      |     K1     |     K17     |  x(0,N)  |     K33    |     K49     |  x(1,N)  |  v[1]     ||    Reg 0 [8:15]      |     K1     |     K17     |  v[1]  |  x(0,N)  |
+    // Reg 0 [16:23]     |     K2     |     K18     |  x(0,N)  |     K34    |     K50     |  x(1,N)  |  v[2]     ||    Reg 0 [16:23]     |     K2     |     K18     |  v[2]  |  x(0,N)  |
+    // Reg 0 [24:31]     |     K3     |     K19     |  x(0,N)  |     K35    |     K51     |  x(1,N)  |  v[3]     ||    Reg 0 [24:31]     |     K3     |     K19     |  v[3]  |  x(0,N)  |
+    // Reg 1 [0:7]       |     K4     |     K20     |  x(0,N)  |     K36    |     K52     |  x(1,N)  |  v[4]     ||    Reg 1 [0:7]       |     K4     |     K20     |  v[4]  |  x(0,N)  |
+    // Reg 1 [8:15]      |     K5     |     K21     |  x(0,N)  |     K37    |     K53     |  x(1,N)  |  v[5]     ||    Reg 1 [8:15]      |     K5     |     K21     |  v[5]  |  x(0,N)  |
+    // Reg 1 [16:23]     |     K6     |     K22     |  x(0,N)  |     K38    |     K54     |  x(1,N)  |  v[6]     ||    Reg 1 [16:23]     |     K6     |     K22     |  v[6]  |  x(0,N)  |
+    // Reg 1 [24:31]     |     K7     |     K23     |  x(0,N)  |     K39    |     K55     |  x(1,N)  |  v[7]     ||    Reg 1 [24:31]     |     K7     |     K23     |  v[7]  |  x(0,N)  |
+    // Reg 2 [0:7]       |     K8     |     K24     |  x(0,N)  |     K40    |     K56     |  x(1,N)  |  v[8]     ||    Reg 2 [0:7]       |     K8     |     K24     |  v[8]  |  x(0,N)  |
+    // Reg 2 [8:15]      |     K9     |     K25     |  x(0,N)  |     K41    |     K57     |  x(1,N)  |  v[9]     ||    Reg 2 [8:15]      |     K9     |     K25     |  v[9]  |  x(0,N)  |
+    // Reg 2 [16:23]     |     K10    |     K26     |  x(0,N)  |     K42    |     K58     |  x(1,N)  |  v[10]    ||    Reg 2 [16:23]     |     K10    |     K26     |  v[10] |  x(0,N)  |
+    // Reg 2 [24:31]     |     K11    |     K27     |  x(0,N)  |     K43    |     K59     |  x(1,N)  |  v[11]    ||    Reg 2 [24:31]     |     K11    |     K27     |  v[11] |  x(0,N)  |
+    // Reg 3 [0:7]       |     K12    |     K28     |  x(0,N)  |     K44    |     K60     |  x(1,N)  |  v[12]    ||    Reg 3 [0:7]       |     K12    |     K28     |  v[12] |  x(0,N)  |
+    // Reg 3 [8:15]      |     K13    |     K29     |  x(0,N)  |     K45    |     K61     |  x(1,N)  |  v[13]    ||    Reg 3 [8:15]      |     K13    |     K29     |  v[13] |  x(0,N)  |
+    // Reg 3 [16:23]     |     K14    |     K30     |  x(0,N)  |     K46    |     K62     |  x(1,N)  |  v[14]    ||    Reg 3 [16:23]     |     K14    |     K30     |  v[14] |  x(0,N)  |
+    // Reg 3 [24:31]     |     K15    |     K31     |  x(0,N)  |     K47    |     K63     |  x(1,N)  |  v[15]    ||    Reg 3 [24:31]     |     K15    |     K31     |  v[15] |  x(0,N)  |
+    // Reg 4 [0:7]       |     K64    |     K80     |  x(2,N)  |     K96    |     K112    |  x(3,N)  |  v[16]    ||    Reg 4 [0:7]       |     K32    |     K48     |  v[16] |  x(1,N)  |
+    // Reg 4 [8:15]      |     K65    |     K81     |  x(2,N)  |     K97    |     K113    |  x(3,N)  |  v[17]    ||    Reg 4 [8:15]      |     K33    |     K49     |  v[17] |  x(1,N)  |
+    // Reg 4 [16:23]     |     K66    |     K82     |  x(2,N)  |     K98    |     K114    |  x(3,N)  |  v[18]    ||    Reg 4 [16:23]     |     K34    |     K50     |  v[18] |  x(1,N)  |
+    // Reg 4 [24:31]     |     K67    |     K83     |  x(2,N)  |     K99    |     K115    |  x(3,N)  |  v[19]    ||    Reg 4 [24:31]     |     K35    |     K51     |  v[19] |  x(1,N)  |
+    // Reg 5 [0:7]       |     K68    |     K84     |  x(2,N)  |     K100   |     K116    |  x(3,N)  |  v[20]    ||    Reg 5 [0:7]       |     K36    |     K52     |  v[20] |  x(1,N)  |
+    // Reg 5 [8:15]      |     K69    |     K85     |  x(2,N)  |     K101   |     K117    |  x(3,N)  |  v[21]    ||    Reg 5 [8:15]      |     K37    |     K53     |  v[21] |  x(1,N)  |
+    // Reg 5 [16:23]     |     K70    |     K86     |  x(2,N)  |     K102   |     K118    |  x(3,N)  |  v[22]    ||    Reg 5 [16:23]     |     K38    |     K54     |  v[22] |  x(1,N)  |
+    // Reg 5 [24:31]     |     K71    |     K87     |  x(2,N)  |     K103   |     K119    |  x(3,N)  |  v[23]    ||    Reg 5 [24:31]     |     K39    |     K55     |  v[23] |  x(1,N)  |
+    // Reg 6 [0:7]       |     K72    |     K88     |  x(2,N)  |     K104   |     K120    |  x(3,N)  |  v[24]    ||    Reg 6 [0:7]       |     K40    |     K56     |  v[24] |  x(1,N)  |
+    // Reg 6 [8:15]      |     K73    |     K89     |  x(2,N)  |     K105   |     K121    |  x(3,N)  |  v[25]    ||    Reg 6 [8:15]      |     K41    |     K57     |  v[25] |  x(1,N)  |
+    // Reg 6 [16:23]     |     K74    |     K90     |  x(2,N)  |     K106   |     K122    |  x(3,N)  |  v[26]    ||    Reg 6 [16:23]     |     K42    |     K58     |  v[26] |  x(1,N)  |
+    // Reg 6 [24:31]     |     K75    |     K91     |  x(2,N)  |     K107   |     K123    |  x(3,N)  |  v[27]    ||    Reg 6 [24:31]     |     K43    |     K59     |  v[27] |  x(1,N)  |
+    // Reg 7 [0:7]       |     K76    |     K92     |  x(2,N)  |     K108   |     K124    |  x(3,N)  |  v[28]    ||    Reg 7 [0:7]       |     K44    |     K60     |  v[28] |  x(1,N)  |
+    // Reg 7 [8:15]      |     K77    |     K93     |  x(2,N)  |     K109   |     K125    |  x(3,N)  |  v[29]    ||    Reg 7 [8:15]      |     K45    |     K61     |  v[29] |  x(1,N)  |
+    // Reg 7 [16:23]     |     K78    |     K94     |  x(2,N)  |     K110   |     K126    |  x(3,N)  |  v[30]    ||    Reg 7 [16:23]     |     K46    |     K62     |  v[30] |  x(1,N)  |
+    // Reg 7 [24:31]     |     K79    |     K95     |  x(2,N)  |     K111   |     K127    |  x(3,N)  |  v[31]    ||    Reg 7 [24:31]     |     K47    |     K63     |  v[31] |  x(1,N)  |
+
+    // clang-format on
+    static constexpr uint32_t VW = vectorSize(BFragT{});
+    static_assert(VW == BLOCK_X, "Fragment size must be equal to BLOCK_X");
+
+    // To start the loading process, let's visualize in 2D coords.
+    // Each thread will load 1 element
+    // We need to know where to start
+    auto startCoord2D = std::make_pair((threadIdx.x / BLOCK_N) * VW / BLOCK_X, // Row
+                                       threadIdx.x % BLOCK_N);                 // Col
+
+    // Flatten to 1D col_major offsets.
+    auto col_major = [](auto const& coord, auto ld) { return coord.first + coord.second * ld; };
+
+    auto startOffset = col_major(startCoord2D, BLOCK_K / BLOCK_X);
+
+    // obtain 8-bit exponent
+    fragX = utils::get_exponent_value(scale_ptr[startOffset]) & 0xFF;
+
+    return load_B_col_major<BType, BFragT, BLOCK_K, BLOCK_N>(input_ptr);
 }
 
 // Define a store function for C
@@ -309,6 +617,129 @@ struct store_C_col_major<CType, CFragT, 32, 32>
     }
 };
 
+// Define a store function for C
+// Size: (BLOCK_M x BLOCK_N)
+// ASSUMPTION:
+// - We want contiguous BLOCK_N sized row neighbors in register.
+// - Data is in row major format
+template <typename CType, typename CFragT, int32_t BLOCK_M, int32_t BLOCK_N>
+struct store_C_row_major;
+
+// Here we want to store a 16x16 block of data.
+//
+// Size              |   BLOCK_N  |   BLOCK_N   |   BLOCK_N   |   BLOCK_N   |
+// N                 | 0  ...  15 |  0  ...  15 | 0  ...  15  |  0  ...  15 |
+// Thread Id         | 0  ...  15 | 16  ...  31 | 32  ... 47  | 48  ...  63 | Vector
+// Register Element   ------------ ------------- ------------ -------------- Element
+// Reg0              |     M0     |     M4      |     M8      |     M12     | v[0]
+// Reg1              |     M1     |     M5      |     M9      |     M13     | v[1]
+// Reg2              |     M2     |     M6      |     M10     |     M14     | v[2]
+// Reg3              |     M3     |     M7      |     M11     |     M15     | v[3]
+template <typename CType, typename CFragT>
+struct store_C_row_major<CType, CFragT, 16, 16>
+{
+    __device__ void operator()(CType* output, CFragT cFrag)
+    {
+        static constexpr uint32_t VW  = vectorSize(cFrag); // 4
+        static constexpr uint32_t Dim = 16;
+
+        // Each thread will load 4 elements.
+        // We need to know where they start, and where the next elements are.
+        auto startCoord2D = std::make_pair((threadIdx.x / Dim) * VW, // Row
+                                           threadIdx.x % Dim);       // Col
+        auto stepCoord2D  = std::make_pair(1u, 0u);
+
+        // Flatten to 1D row_major offsets.
+        auto row_major = [](auto const& coord, auto ld) { return coord.first * ld + coord.second; };
+
+        auto startOffset = row_major(startCoord2D, 16);
+        auto kOffset     = row_major(stepCoord2D, 16);
+
+        auto* fragPtr = reinterpret_cast<CFragT*>(output + startOffset);
+        *fragPtr      = cFrag;
+
+        // If you notice carefully, kOffset != 1.
+        // This means the following is vector is updated with 4 non-contiguous offsets,
+        // which the compiler will separate into 4 different global_store_dword instructions.
+        output[startOffset]               = cFrag[0]; // v[0] = Reg 0
+        output[startOffset + kOffset]     = cFrag[1]; // v[1] = Reg 1
+        output[startOffset + 2 * kOffset] = cFrag[2]; // v[2] = Reg 2
+        output[startOffset + 3 * kOffset] = cFrag[3]; // v[3] = Reg 3
+    }
+};
+
+// Here we want to store a 32x32 block of data.
+// Register Mapping:
+
+// Size              |   BLOCK_N  |   BLOCK_N   |
+// N                 | 0  ...  31 |  0  ...  31 |
+// Thread Id         | 0  ...  31 | 32  ...  63 | Vector
+// Register Element   ------------ -------------  Element
+// Reg0              |     M0     |     M4      | v[0]
+// Reg1              |     M1     |     M5      | v[1]
+// Reg2              |     M2     |     M6      | v[2]
+// Reg3              |     M3     |     M7      | v[3]
+//                    ____________ _____________
+// Reg4              |     M8     |     M12     | v[4]
+// Reg5              |     M9     |     M13     | v[5]
+// Reg6              |     M10    |     M14     | v[6]
+// Reg7              |     M11    |     M15     | v[7]
+//                    ____________ _____________
+// Reg8              |     M16    |     M20     | v[8]
+// Reg9              |     M17    |     M21     | v[9]
+// Reg10             |     M18    |     M22     | v[10]
+// Reg11             |     M19    |     M23     | v[11]
+//                    ____________ _____________
+// Reg12             |     M24    |     M28     | v[12]
+// Reg13             |     M25    |     M29     | v[13]
+// Reg14             |     M26    |     M30     | v[14]
+// Reg15             |     M27    |     M31     | v[15]
+
+template <typename CType, typename CFragT>
+struct store_C_row_major<CType, CFragT, 32, 32>
+{
+    __device__ void operator()(CType* output, CFragT cFrag)
+    {
+        static constexpr uint32_t WAVE_SIZE      = 64;
+        static constexpr uint32_t VW             = 4;                   // This VW is per 'chunk'
+        static constexpr uint32_t Dim            = 32;                  // BLOCK_N
+        static constexpr uint32_t M_PER_VW_CHUNK = VW * WAVE_SIZE / 32; // 8
+
+        auto startCoord2D = std::make_pair((threadIdx.x / Dim) * VW, // Row
+                                           threadIdx.x % Dim);       // Col
+
+        // Minor step for each 'chunk'
+        auto minorStepCoord2D = std::make_pair(1u, 0u);
+
+        // Major step between 'chunks'
+        auto majorStepCoord2D = std::make_pair(M_PER_VW_CHUNK, 0);
+
+        // Flatten to 1D row_major offsets.
+        auto row_major = [](auto const& coord, auto ld) { return coord.first * ld + coord.second; };
+
+        auto startOffset  = row_major(startCoord2D, 32);
+        auto kMinorOffset = row_major(minorStepCoord2D, 32);
+        auto kMajorOffset = row_major(majorStepCoord2D, 32);
+
+        output[startOffset]                                       = cFrag[0];  // v[0] = Reg 0
+        output[startOffset + kMinorOffset]                        = cFrag[1];  // v[1] = Reg 1
+        output[startOffset + 2 * kMinorOffset]                    = cFrag[2];  // v[2] = Reg 2
+        output[startOffset + 3 * kMinorOffset]                    = cFrag[3];  // v[3] = Reg 3
+        output[startOffset + kMajorOffset]                        = cFrag[4];  // v[4] = Reg 4
+        output[startOffset + kMajorOffset + kMinorOffset]         = cFrag[5];  // v[5] = Reg 5
+        output[startOffset + kMajorOffset + 2 * kMinorOffset]     = cFrag[6];  // v[6] = Reg 6
+        output[startOffset + kMajorOffset + 3 * kMinorOffset]     = cFrag[7];  // v[7] = Reg 7
+        output[startOffset + 2 * kMajorOffset]                    = cFrag[8];  // v[8] = Reg 8
+        output[startOffset + 2 * kMajorOffset + kMinorOffset]     = cFrag[9];  // v[9] = Reg 9
+        output[startOffset + 2 * kMajorOffset + 2 * kMinorOffset] = cFrag[10]; // v[10] = Reg 10
+        output[startOffset + 2 * kMajorOffset + 3 * kMinorOffset] = cFrag[11]; // v[11] = Reg 11
+        output[startOffset + 3 * kMajorOffset]                    = cFrag[12]; // v[12] = Reg 12
+        output[startOffset + 3 * kMajorOffset + kMinorOffset]     = cFrag[13]; // v[13] = Reg 13
+        output[startOffset + 3 * kMajorOffset + 2 * kMinorOffset] = cFrag[14]; // v[14] = Reg 14
+        output[startOffset + 3 * kMajorOffset + 3 * kMinorOffset] = cFrag[15]; // v[15] = Reg 15
+    }
+};
+
 template <typename AType,
           typename BType,
           typename CType,
@@ -352,6 +783,61 @@ __global__ void matmul(const AType* a, const BType* b, CType* c)
     auto storeC = store_C_col_major<CType, CFragT, BLOCK_M, BLOCK_N>{};
     storeC(c, fragC);
 }
+
+template <typename AType,
+          typename BType,
+          typename ScaleType,
+          typename CType,
+          typename AccType,
+          int32_t BLOCK_M,
+          int32_t BLOCK_N,
+          int32_t BLOCK_K,
+          int32_t BLOCK_X>
+__global__ void
+matmul(const AType* a, const ScaleType* xa, const BType* b, const ScaleType* xb, CType* c)
+{
+    constexpr int WAVE_SIZE = 64;
+    assert(threadIdx.x < WAVE_SIZE);
+    assert(blockDim.x == 1 && blockDim.y == 1 && blockDim.z == 1);
+
+    using AFragT        = vector_type<AType, BLOCK_M * BLOCK_K / WAVE_SIZE>::type;
+    using BFragT        = vector_type<BType, BLOCK_K * BLOCK_N / WAVE_SIZE>::type;
+    using CFragT        = vector_type<CType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
+    using AccumFragT    = vector_type<AccType, BLOCK_M * BLOCK_N / WAVE_SIZE>;
+    using RawAccumFragT = vector_type<AccType, BLOCK_M * BLOCK_N / WAVE_SIZE>::type;
+    using ScaleFragT    = int32_t;
+
+    // Create frags
+    auto fragA   = AFragT{};
+    auto fragB   = BFragT{};
+    auto fragC   = CFragT{};
+    auto fragAcc = AccumFragT{0};
+    auto fragXa  = ScaleFragT{0};
+    auto fragXb  = ScaleFragT{0};
+
+    // Load the inputs.
+    // A = col major, BLOCK_M x BLOCK_K
+    fragA = load_mx_A_row_major<AType, AFragT, ScaleType, ScaleFragT, BLOCK_M, BLOCK_K, BLOCK_X>(
+        a, xa, fragXa);
+
+    // B = col major, BLOCK_K x BLOCK_N
+    fragB = load_mx_B_col_major<BType, BFragT, ScaleType, ScaleFragT, BLOCK_K, BLOCK_N, BLOCK_X>(
+        b, xb, fragXb);
+
+    // Scaled Matrix multiply-accumulate using MFMA units
+    // Accumulation intermediate = BLOCK_M x BLOCK_N
+    mfma_type_selector<AFragT, BFragT, AccumFragT, BLOCK_M, BLOCK_N>{}(
+        fragA, fragXa, fragB, fragXb, fragAcc);
+
+    for(int i = 0; i < vectorSize(fragC); ++i)
+    {
+        fragC[i] = type_convert<CType>(fragAcc.template AsType<RawAccumFragT>()[Number<0>{}][i]);
+    }
+
+    auto storeC = store_C_row_major<CType, CFragT, BLOCK_M, BLOCK_N>{};
+    storeC(c, fragC);
+}
+
 /**
  * @brief Structure to hold dimension parameters for GEMM tensors.
  *
@@ -373,6 +859,225 @@ struct GemmParams
     ck::index_t StrideC = -1;
 };
 
+namespace mxmfma_test {
+template <typename ADataType, typename BDataType, typename ScaleType, typename CDataType>
+void RunHostGEMM(const Tensor<ADataType>& A,
+                 const Tensor<ScaleType>& a_scales,
+                 const Tensor<BDataType>& B,
+                 const Tensor<ScaleType>& b_scales,
+                 Tensor<CDataType>& C)
+{
+    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+    using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceMXGemm<ADataType,
+                                                                              BDataType,
+                                                                              CDataType,
+                                                                              float,
+                                                                              ScaleType,
+                                                                              PassThrough,
+                                                                              PassThrough,
+                                                                              PassThrough,
+                                                                              float,
+                                                                              float>;
+    auto ref_gemm               = ReferenceGemmInstance{};
+    auto ref_invoker            = ref_gemm.MakeInvoker();
+
+    auto ref_argument = ref_gemm.MakeArgument(
+        A, a_scales, B, b_scales, C, PassThrough{}, PassThrough{}, PassThrough{});
+
+    ref_invoker.Run(ref_argument);
+}
+
+template <typename KernelType,
+          typename ADataType,
+          typename BDataType,
+          typename ScaleType,
+          typename CDataType>
+bool RunDeviceGEMM(KernelType kernel,
+                   const Tensor<ADataType>& A,
+                   const Tensor<ScaleType>& a_scales,
+                   const Tensor<BDataType>& B,
+                   const Tensor<ScaleType>& b_scales,
+                   Tensor<CDataType>& C)
+{
+    DeviceMem a_m_k_device_buf(sizeof(ADataType) * A.mDesc.GetElementSpaceSize());
+    DeviceMem a_scales_device_buf(sizeof(ScaleType) * a_scales.mDesc.GetElementSpaceSize());
+    DeviceMem b_n_k_device_buf(sizeof(BDataType) * B.mDesc.GetElementSpaceSize());
+    DeviceMem b_scales_device_buf(sizeof(ScaleType) * b_scales.mDesc.GetElementSpaceSize());
+    DeviceMem c_m_n_device_buf(sizeof(CDataType) * C.mDesc.GetElementSpaceSize());
+
+    a_m_k_device_buf.ToDevice(A.mData.data());
+    a_scales_device_buf.ToDevice(a_scales.mData.data());
+    b_n_k_device_buf.ToDevice(B.mData.data());
+    b_scales_device_buf.ToDevice(b_scales.mData.data());
+    kernel<<<1, 64>>>(static_cast<const ADataType*>(a_m_k_device_buf.GetDeviceBuffer()),
+                      static_cast<const ScaleType*>(a_scales_device_buf.GetDeviceBuffer()),
+                      static_cast<const BDataType*>(b_n_k_device_buf.GetDeviceBuffer()),
+                      static_cast<const ScaleType*>(b_scales_device_buf.GetDeviceBuffer()),
+                      static_cast<CDataType*>(c_m_n_device_buf.GetDeviceBuffer()));
+    c_m_n_device_buf.FromDevice(C.mData.data());
+
+    return true;
+}
+
+template <typename DeviceMFMA,
+          typename ADataType,
+          typename BDataType,
+          typename ScaleType,
+          typename CDataType,
+          typename ALayout,
+          typename BLayout,
+          typename CLayout,
+          index_t BLOCK_M,
+          index_t BLOCK_N,
+          index_t BLOCK_K,
+          index_t BLOCK_X>
+struct TestMXMFMA
+{
+    auto PrepareGemmTensors(const GemmParams& params, index_t init)
+    {
+        auto f_host_tensor_descriptor =
+            [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+                if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+                {
+                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                                                std::vector<std::size_t>({stride, 1}));
+                }
+                else
+                {
+                    return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+                                                std::vector<std::size_t>({1, stride}));
+                }
+            };
+
+        Tensor<ADataType> a_m_k(
+            f_host_tensor_descriptor(params.M, params.K, params.StrideA, ALayout{}));
+        Tensor<ScaleType> a_scales(
+            f_host_tensor_descriptor(params.M, params.K / BLOCK_X, params.K / BLOCK_X, ALayout{}));
+        Tensor<BDataType> b_n_k(
+            f_host_tensor_descriptor(params.K, params.N, params.StrideB, BLayout{}));
+        Tensor<ScaleType> b_scales(
+            f_host_tensor_descriptor(params.K / BLOCK_X, params.N, params.K / BLOCK_X, BLayout{}));
+        Tensor<CDataType> c_m_n_host_result(
+            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
+        Tensor<CDataType> c_m_n_device_result(
+            f_host_tensor_descriptor(params.M, params.N, params.StrideC, CLayout{}));
+
+        switch(init)
+        {
+        case 0:
+            a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1.0f});
+            a_scales.GenerateTensorValue(
+                GeneratorTensor_1<ScaleType>{ScaleType{0.015625f}}); // 1/64
+            // NOTE: not all numbers are representable in FP8, BF8, etc.
+            // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 16 18 20 20 20 22 24 24 24 26 28 28 28 30 32
+            b_n_k.GenerateTensorValue(GeneratorTensor_Sequential<BDataType, 1>{});
+            b_scales.GenerateTensorValue(GeneratorTensor_1<ScaleType>{ScaleType{1.0f}});
+            break;
+        case 1:
+            // results in C = {K}
+            a_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1.0f});
+            a_scales.GenerateTensorValue(GeneratorTensor_1<ScaleType>{ScaleType{512.0f}});
+            b_n_k.GenerateTensorValue(GeneratorTensor_1<BDataType>{1.0f});
+            b_scales.GenerateTensorValue(GeneratorTensor_1<ScaleType>{ScaleType{1.0f / 512}});
+            break;
+        case 2:
+            // expect small round off errors
+            a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{-2.0, 2.0});
+            a_scales.GenerateTensorValue(
+                GeneratorTensor_2<ScaleType>{126, 129}); // scales: {0.5, 1, 2}
+
+            b_n_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{-2.0, 2.0});
+            b_scales.GenerateTensorValue(GeneratorTensor_2<ScaleType>{126, 129});
+            break;
+
+        case 3:
+            // expect small round off errors
+            a_m_k.GenerateTensorValue(GeneratorTensor_4<ADataType>(0, 1));
+            a_scales.GenerateTensorValue(
+                GeneratorTensor_2<ScaleType>{126, 129}); // scales: {0.5, 1, 2}
+            b_n_k.GenerateTensorValue(GeneratorTensor_4<BDataType>(0, 1));
+            b_scales.GenerateTensorValue(
+                GeneratorTensor_2<ScaleType>{126, 129}); //  scales: {0.5, 1, 2}
+            break;
+        default:
+            // all initial values are representable in FP8, BF8
+            a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 6}); // Z[-5,5]
+            a_scales.GenerateTensorValue(
+                GeneratorTensor_2<ScaleType>{122, 129});                    // scales: [1/32,..., 2]
+            b_n_k.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 6}); // Z[-5,5]
+            b_scales.GenerateTensorValue(
+                GeneratorTensor_2<ScaleType>{122, 129}); //  scales: [1/32,..., 2]
+
+            break;
+        }
+
+        return std::make_tuple(
+            a_m_k, a_scales, b_n_k, b_scales, c_m_n_host_result, c_m_n_device_result);
+    }
+
+    auto operator()(const DeviceMFMA& mfma_kernel, index_t init)
+    {
+        // Arrange
+        GemmParams params;
+        params.M = BLOCK_M;
+        params.N = BLOCK_N;
+        params.K = BLOCK_K;
+
+        auto f_get_default_stride = [](std::size_t row,
+                                       std::size_t col,
+                                       ck::index_t stride,
+                                       auto layout) {
+            if(stride == -1)
+            {
+                // give a chance if stride is -1, return a default packed stride
+                if constexpr(std::is_same_v<decltype(layout), ck::tensor_layout::gemm::RowMajor>)
+                {
+                    return static_cast<std::size_t>(col);
+                }
+                else
+                {
+                    return static_cast<std::size_t>(row);
+                }
+            }
+            else
+                return static_cast<std::size_t>(stride);
+        };
+
+        params.StrideA = f_get_default_stride(BLOCK_M, BLOCK_K, params.StrideA, ALayout{});
+        params.StrideB = f_get_default_stride(BLOCK_K, BLOCK_N, params.StrideB, BLayout{});
+        params.StrideC = f_get_default_stride(BLOCK_M, BLOCK_N, params.StrideC, CLayout{});
+
+        auto host_tensors = PrepareGemmTensors(params, init);
+
+        const Tensor<ADataType>& a        = std::get<0>(host_tensors);
+        const Tensor<ScaleType>& a_scales = std::get<1>(host_tensors);
+        const Tensor<BDataType>& b        = std::get<2>(host_tensors);
+        const Tensor<ScaleType>& b_scales = std::get<3>(host_tensors);
+        Tensor<CDataType>& c_host         = std::get<4>(host_tensors);
+        Tensor<CDataType>& c_device       = std::get<5>(host_tensors);
+
+        RunHostGEMM(a, a_scales, b, b_scales, c_host);
+
+        RunDeviceGEMM(mfma_kernel, a, a_scales, b, b_scales, c_device);
+
+        bool res = false;
+        if constexpr(std::is_same<CDataType, float>::value ||
+                     std::is_same<CDataType, half_t>::value)
+        {
+            res = ck::utils::check_err(c_device.mData, c_host.mData);
+        }
+        else
+        {
+            std::cout << "UNSUPPORTED CDataType" << std::endl;
+        }
+
+        return res;
+    }
+};
+
+} // namespace mxmfma_test
+
 namespace mfma_test {
 template <typename GemmInstance,
           typename ADataType,
@@ -489,9 +1194,6 @@ struct TestMFMA
 
     auto operator()(const DeviceMFMA& mfma_kernel, index_t init)
     {
-        std::cout << "ALayout = " << ALayout{}.name << ", BLayout = " << BLayout{}.name
-                  << ", CLayout = " << CLayout{}.name << std::endl;
-
         // Arrange
         GemmParams params;
         params.M = BLOCK_M;
@@ -541,7 +1243,8 @@ struct TestMFMA
                                                                                 CPUAccDataType,
                                                                                 PassThrough,
                                                                                 PassThrough,
-                                                                                PassThrough>;
+                                                                                PassThrough,
+                                                                                CPUAccDataType>;
 
         RunHostGEMM<ReferenceGemmInstance>(a, b, c_host, a_element_op, b_element_op, c_element_op);
 
@@ -552,7 +1255,6 @@ struct TestMFMA
                      std::is_same<CDataType, half_t>::value)
         {
             res = ck::utils::check_err(c_device.mData, c_host.mData);
-            std::cout << (res ? "SUCCESS" : "FAILURE") << std::endl;
         }
         else
         {