From 2ff7ac5abc566cd0c43853a57dbeaa1f3325fded Mon Sep 17 00:00:00 2001
From: Aviral Goel <aviral.goel@amd.com>
Date: Fri, 10 Apr 2026 11:17:11 -0400
Subject: [PATCH] CK: Remove 41 commented-out dead code blocks (~200 lines)
 (#6302)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Depends on #6300

## Summary

Remove 41 commented-out code blocks across 33 files in Composable
Kernel, totaling ~200 lines.

Identified using an automated dead code scanning skill (`ck-dead-code`)
with a calibrated two-stage pipeline:
1. **Pre-filter**: Keyword-based scan found 1,338 `//`-commented blocks.
Calibrated heuristics (trained on 50-sample expert classification)
reduced to 89 high-confidence candidates — 93% noise reduction.
2. **Expert triage**: LLM expert classified each block in context as
CODE_REMOVE, CODE_KEEP, or NOT_CODE.

| Classification | Count |
|---------------|-------|
| Removed (this PR) | 41 |
| Kept (debug helpers, alt configs, reference impls) | 32 |
| Not code (false positives) | 16 |

Removed blocks include: superseded implementations, old test data,
abandoned stubs, unreachable code, and buggy dead code.
---
 .../test/grouped_conv_fwd_multiple_d_v1.cpp   |   4 -
 .../test/grouped_conv_fwd_multiple_d_v2.cpp   |   4 -
 .../test/grouped_conv_fwd_multiple_d_v3.cpp   |   4 -
 .../test/grouped_conv_fwd_multiple_d_v4.cpp   |   4 -
 ...ABD_xdl_multiply_bias_fastgelu_bf16_i8.cpp |  10 -
 .../moe_gemm2_xdl_fp8.cpp                     |   3 -
 .../moe_gemm2_xdl_fp8_blockscale.cpp          |  10 -
 ...norm2d_rdquant_fwd_bf16_n1024_instance.cpp |   8 -
 ...norm2d_rdquant_fwd_fp16_n1024_instance.cpp |   8 -
 .../smoothquant_bf16_n1024_instance.cpp       |   8 -
 .../smoothquant_fp16_n1024_instance.cpp       |   8 -
 .../moe_smoothquant_bf16_n1024_instance.cpp   |   8 -
 .../moe_smoothquant_fp16_n1024_instance.cpp   |   8 -
 include/ck/host_utility/flush_cache.hpp       |  10 -
 ...n3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp |   7 -
 ...dlops_blockscale_b_preshuffle_selector.hpp |  26 -
 ...s_moe_blockscale_b_preshuffle_selector.hpp |  26 -
 ...roup_tensor_slice_transfer_direct_load.hpp |   6 -
 ...nsor_slice_transfer_gather_direct_load.hpp |   6 -
 ...d_contraction_multiple_d_wmma_cshuffle.hpp |   5 -
 ...emm_softmax_gemm_permute_wmma_cshuffle.hpp | 141 ---
 ...e_grouped_query_attention_forward_wmma.hpp | 141 ---
 ...ice_multi_query_attention_forward_wmma.hpp | 141 ---
 .../gridwise_gemm_multiple_d_xdl_cshuffle.hpp |   6 -
 .../gridwise_gemm_xdl_cshuffle_streamk_v3.hpp | 175 ----
 .../grid/gridwise_gemm_xdl_cshuffle_v3.hpp    |  39 -
 ...wise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp |  39 -
 .../gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp |  39 -
 ...ridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp |  76 --
 .../gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp |  39 -
 ..._gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp |  39 -
 ...m_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp |  39 -
 .../grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp |  39 -
 ...se_gemm_xdl_cshuffle_v3_mx_bpreshuffle.hpp |  39 -
 .../gpu/grid/gridwise_moe_mx_gemm.hpp         | 806 ------------------
 .../gpu/grid/gridwise_moe_mx_gemm_bns.hpp     |  44 -
 .../tensor_operation/gpu/warp/wmma_gemm.hpp   |  12 +-
 ...ransform_contraction_to_gemm_arraybase.hpp |   5 -
 .../ck/utility/container_element_picker.hpp   |   6 -
 include/ck/utility/dynamic_buffer.hpp         |   6 -
 include/ck/utility/transpose_vectors.hpp      |  17 -
 include/ck/utility/workgroup_barrier.hpp      |  14 -
 .../core/arch/amd_buffer_addressing.hpp       |  16 -
 .../arch/amd_buffer_addressing_builtins.hpp   |  16 -
 include/ck_tile/core/container/array.hpp      |  20 -
 include/ck_tile/core/container/sequence.hpp   |  26 -
 .../container/statically_indexed_array.hpp    |  14 -
 .../ck_tile/core/container/thread_buffer.hpp  |  31 -
 include/ck_tile/core/container/tuple.hpp      |  10 -
 include/ck_tile/core/numeric/half.hpp         |  87 --
 include/ck_tile/core/numeric/int8.hpp         |  21 -
 include/ck_tile/core/tensor/sweep_tile.hpp    |   4 -
 .../ck_tile/core/tensor/tile_distribution.hpp |  39 -
 .../unary_element_wise_operation.hpp          |  63 --
 .../flatmm/kernel/grouped_flatmm_kernel.hpp   |  10 -
 .../ops/flatmm/kernel/moe_flatmm_kernel.hpp   |   7 -
 ...mm_pipeline_agmem_bgmem_creg_v1_policy.hpp |   4 -
 ...ec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp |   9 -
 .../moe_flatmm_pipeline_agmem_bgmem_creg.hpp  |  13 +-
 ...mx_flatmm_pipeline_agmem_bgmem_creg_v1.hpp |   4 -
 ...k_fmha_pipeline_qx_ks_vs_custom_policy.hpp |   3 -
 .../fused_moe/kernel/moe_sorting_kernel.hpp   | 138 +--
 .../pipeline/moe_sorting_pipeline.hpp         |   8 -
 ...block_gemm_areg_bsmem_creg_one_warp_v1.hpp |   3 -
 ...gemm_areg_bsmem_creg_v1_default_policy.hpp |  23 -
 ...gemm_areg_bsmem_creg_v2_default_policy.hpp |  23 -
 ...gemm_asmem_breg_creg_v1_default_policy.hpp |  23 -
 .../norm_reduce/block/block_norm_reduce.hpp   |  15 -
 .../ops/reduce/block/block_reduce2d.hpp       |  26 -
 .../gpu/gemm_streamk.hpp                      |  60 --
 ...e_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn.hpp |   6 -
 ..._streamk_f16_f16_f16_mk_kn_mn_instance.cpp |   3 -
 ...mm_xdl_universal_bf16_i4_bf16_mk_nk_mn.hpp |  19 -
 ...gemm_xdl_universal_f16_i4_f16_mk_nk_mn.hpp |  19 -
 profiler/src/profile_softmax.cpp              |   7 -
 .../block_swizzle_test/block_swizzle_test.cpp |  12 +-
 ...norm2d_rdquant_fwd_bf16_n1024_instance.cpp |   8 -
 ...norm2d_rdquant_fwd_fp16_n1024_instance.cpp |   8 -
 .../moe_smoothquant_bf16_n1024_instance.cpp   |   8 -
 .../moe_smoothquant_fp16_n1024_instance.cpp   |   8 -
 .../smoothquant_bf16_n1024_instance.cpp       |   8 -
 .../smoothquant_fp16_n1024_instance.cpp       |   8 -
 82 files changed, 22 insertions(+), 2883 deletions(-)

diff --git a/codegen/test/grouped_conv_fwd_multiple_d_v1.cpp b/codegen/test/grouped_conv_fwd_multiple_d_v1.cpp
index 863501cd0a..9895ed7e54 100644
--- a/codegen/test/grouped_conv_fwd_multiple_d_v1.cpp
+++ b/codegen/test/grouped_conv_fwd_multiple_d_v1.cpp
@@ -198,10 +198,6 @@ struct Epilogue
                                                               input_left_pads,
                                                               input_right_pads);
 
-        // auto res = rtc::from_gpu(out_dev);
-        // pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
-        // assert(pass);
-
         // Simple check: this checks that the output from each instance matches the output from the
         // first instance
         CHECK(report(solution, check(rtc::from_gpu(out_dev))));
diff --git a/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp b/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp
index e748a29743..617c2318d5 100644
--- a/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp
+++ b/codegen/test/grouped_conv_fwd_multiple_d_v2.cpp
@@ -198,10 +198,6 @@ struct Epilogue
                                                               input_left_pads,
                                                               input_right_pads);
 
-        // auto res = rtc::from_gpu(out_dev);
-        // pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
-        // assert(pass);
-
         // Simple check: this checks that the output from each instance matches the output from the
         // first instance
         CHECK(report(solution, check(rtc::from_gpu(out_dev))));
diff --git a/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp b/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp
index a68fb53cba..84516b2577 100644
--- a/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp
+++ b/codegen/test/grouped_conv_fwd_multiple_d_v3.cpp
@@ -198,10 +198,6 @@ struct Epilogue
                                                               input_left_pads,
                                                               input_right_pads);
 
-        // auto res = rtc::from_gpu(out_dev);
-        // pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
-        // assert(pass);
-
         // Simple check: this checks that the output from each instance matches the output from the
         // first instance
         CHECK(report(solution, check(rtc::from_gpu(out_dev))));
diff --git a/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp b/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp
index 0262319c39..3490c38f6a 100644
--- a/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp
+++ b/codegen/test/grouped_conv_fwd_multiple_d_v4.cpp
@@ -198,10 +198,6 @@ struct Epilogue
                                                               input_left_pads,
                                                               input_right_pads);
 
-        // auto res = rtc::from_gpu(out_dev);
-        // pass &= ck::utils::check_err(res, out_host, "Error: incorrect results!", 1e-5f, 1e-4f);
-        // assert(pass);
-
         // Simple check: this checks that the output from each instance matches the output from the
         // first instance
         CHECK(report(solution, check(rtc::from_gpu(out_dev))));
diff --git a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8.cpp b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8.cpp
index b0b2d29d98..2ceca3c877 100644
--- a/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8.cpp
+++ b/example/60_gemm_multi_ABD/gemm_multi_ABD_xdl_multiply_bias_fastgelu_bf16_i8.cpp
@@ -238,16 +238,6 @@ int main(int argc, char* argv[])
 
         Tensor<B1DataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, B0Layout{}));
 
-#if 0
-        for(int n = 0; n < N; ++n)
-        {
-            for(int k = 0; k < K; ++k)
-            {
-                b_element_op(b_k_n(k, n), b0_k_n(k, n), b1_k_n(k, n));
-            }
-        }
-#endif
-
         using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<A0DataType,
                                                                                 B0DataType,
                                                                                 CShuffleDataType,
diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp
index 9f4cd13573..008edb533b 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp
@@ -252,9 +252,6 @@ int main(int argc, char* argv[])
     Tensor<ck::index_t> expert_ids(HostTensorDescriptor({sorted_tile_num}, {1}));
     Tensor<ck::index_t> sorted_token_ids(HostTensorDescriptor({sorted_size}, {1}));
     Tensor<ck::index_t> max_token_id(HostTensorDescriptor({1}));
-    // max_token_id.mData[0] = valid_size;
-    // max_token_id.mData = {valid_size, 0, 2, 3, 4, 6, 8, 10, 12, 13};
-    // int eids[]         = {0, 0, 1, 2, 3, 3, 4, 4, 5, 5, 6, 7, 7, 3, 3, 3};
     max_token_id.mData = {valid_size, 0, 1, 2, 3, 4, 5, 6, 7, 8};
     // int eids[]         = {0, 1, 2, 3, 4, 5, 6, 7, 3, 3, 3}; // {2, 1, 1, 2, 2, 2, 1, 2}
     for(int i = 0; i < sorted_tile_num; i++)
diff --git a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp
index 552d3cd7b5..8ae97ef1c2 100644
--- a/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp
+++ b/example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8_blockscale.cpp
@@ -261,16 +261,6 @@ int main(int argc, char* argv[])
     Tensor<ck::index_t> max_token_id(HostTensorDescriptor({1}));
 
     max_token_id.mData = {valid_size, 0, 1, 2, 3, 4, 5, 6, 7, 8};
-    // int eids[]         = {0, 1, 3, 3, 3};
-    //  int eids[]         = {0, 1, 2, 3, 4, 5, 6, 7}; //, 3, 3, 3}; // {2, 1, 1, 2, 2, 2, 1, 2}
-    // int eids[] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 3, 3, 3};
-    // int eids[]         = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    //                     1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-    //                     2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    //                     3, 3, 3, 3, 3, 3, 3, 3, 4, 4,
-    //                     5, 5, 5, 5, 6, 6, 6, 6, 7, 7,
-    //                     7, 7,
-    //                     3, 3, 3};
     for(int i = 0; i < sorted_tile_num; i++)
     {
         expert_ids.mData[i] = i / ck::math::integer_divide_ceil(valid_tile_num, experts);
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1024_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1024_instance.cpp
index 8f4813a47e..ca49114844 100644
--- a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1024_instance.cpp
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1024_instance.cpp
@@ -5,14 +5,6 @@
 
 // clang-format off
 //                                                                rm  rn  tm  tn  vn  pd      x     3p
-#if 0
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1,  2,  4,  64, 8,  true , true, false>>(const S&, A);
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1,  4,  4,  64, 4,  true , true, false>>(const S&, A);
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1,  8,  4,  64, 2,  true , true, false>>(const S&, A);
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1, 16,  4,  64, 1,  true , true, false>>(const S&, A);
-
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1,  1,  1, 256, 4,  true , true, false>>(const S&, A);
-#endif
 
 template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  ck_tile::int8_t, 1, 1, 2,  128, 8,  true,  true, false>>(const S&, A);
 template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  ck_tile::int8_t, 1, 2, 2,  128, 4,  true,  true, false>>(const S&, A);
diff --git a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1024_instance.cpp b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1024_instance.cpp
index e357d7e3ac..f754d8e959 100644
--- a/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1024_instance.cpp
+++ b/example/ck_tile/11_add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1024_instance.cpp
@@ -5,14 +5,6 @@
 
 // clang-format off
 //                                                                rm  rn  tm  tn  vn  pd      x     3p
-#if 0
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 8,  true , true, false>>(const S&, A);
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1,  4,  4,  64, 4,  true , true, false>>(const S&, A);
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1,  8,  4,  64, 2,  true , true, false>>(const S&, A);
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1, 16,  4,  64, 1,  true , true, false>>(const S&, A);
-
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1,  1,  1, 256, 4,  true , true, false>>(const S&, A);
-#endif
 
 template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  ck_tile::int8_t, 1, 1, 2,  128, 8,  true,  true, false>>(const S&, A);
 template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  ck_tile::int8_t, 1, 2, 2,  128, 4,  true,  true, false>>(const S&, A);
diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n1024_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n1024_instance.cpp
index 8a5e0c74a0..66f427247a 100644
--- a/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n1024_instance.cpp
+++ b/example/ck_tile/12_smoothquant/instances/smoothquant_bf16_n1024_instance.cpp
@@ -5,14 +5,6 @@
 
 // clang-format off
 //                                                  rm rn  tm   tn  vn   pd   2p
-#if 0
-template float smoothquant_<trait_<ck_tile::bf16_t, 1,  2,  4,  64, 8,  true, false>>(const S&, A);
-template float smoothquant_<trait_<ck_tile::bf16_t, 1,  4,  4,  64, 4,  true, false>>(const S&, A);
-template float smoothquant_<trait_<ck_tile::bf16_t, 1,  8,  4,  64, 2,  true, false>>(const S&, A);
-template float smoothquant_<trait_<ck_tile::bf16_t, 1, 16,  4,  64, 1,  true, false>>(const S&, A);
-
-template float smoothquant_<trait_<ck_tile::bf16_t, 1,  1,  1, 256, 4,  true, false>>(const S&, A);
-#endif
 
 template float smoothquant_<trait_<ck_tile::bf16_t,  1, 1, 2,  128, 8,  true, false>>(const S&, A);
 template float smoothquant_<trait_<ck_tile::bf16_t,  1, 2, 2,  128, 4,  true, false>>(const S&, A);
diff --git a/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n1024_instance.cpp b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n1024_instance.cpp
index 9c08cf64f0..103f7281b0 100644
--- a/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n1024_instance.cpp
+++ b/example/ck_tile/12_smoothquant/instances/smoothquant_fp16_n1024_instance.cpp
@@ -5,14 +5,6 @@
 
 // clang-format off
 //                                                  rm rn tm    tn  vn   pd    2p
-#if 0
-template float smoothquant_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 8,  true ,false>>(const S&, A);
-template float smoothquant_<trait_<ck_tile::fp16_t, 1,  4,  4,  64, 4,  true ,false>>(const S&, A);
-template float smoothquant_<trait_<ck_tile::fp16_t, 1,  8,  4,  64, 2,  true ,false>>(const S&, A);
-template float smoothquant_<trait_<ck_tile::fp16_t, 1, 16,  4,  64, 1,  true ,false>>(const S&, A);
-
-template float smoothquant_<trait_<ck_tile::fp16_t, 1,  1,  1, 256, 4,  true ,false>>(const S&, A);
-#endif
 
 template float smoothquant_<trait_<ck_tile::fp16_t,  1, 1, 2,  128, 8,  true, false>>(const S&, A);
 template float smoothquant_<trait_<ck_tile::fp16_t,  1, 2, 2,  128, 4,  true, false>>(const S&, A);
diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp
index 8c72b81dc1..56fcca3beb 100644
--- a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp
+++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp
@@ -5,14 +5,6 @@
 
 // clang-format off
 //                                                  rm rn  tm   tn  vn   pd   2p
-#if 0
-template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1,  2,  4,  64, 8,  true, false>>(const S&, A);
-template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1,  4,  4,  64, 4,  true, false>>(const S&, A);
-template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1,  8,  4,  64, 2,  true, false>>(const S&, A);
-template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1, 16,  4,  64, 1,  true, false>>(const S&, A);
-
-template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1,  1,  1, 256, 4,  true, false>>(const S&, A);
-#endif
 
 template float moe_smoothquant_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 1, 2,  128, 8,  true, false>>(const S&, A);
 template float moe_smoothquant_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 2, 2,  128, 4,  true, false>>(const S&, A);
diff --git a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp
index 6d7a5e7c1f..2462cd218e 100644
--- a/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp
+++ b/example/ck_tile/14_moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp
@@ -5,14 +5,6 @@
 
 // clang-format off
 //                                                  rm rn tm    tn  vn   pd    2p
-#if 0
-template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 8,  true ,false>>(const S&, A);
-template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1,  4,  4,  64, 4,  true ,false>>(const S&, A);
-template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1,  8,  4,  64, 2,  true ,false>>(const S&, A);
-template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1, 16,  4,  64, 1,  true ,false>>(const S&, A);
-
-template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1,  1,  1, 256, 4,  true ,false>>(const S&, A);
-#endif
 
 template float moe_smoothquant_<trait_<ck_tile::fp16_t, ck_tile::int8_t,  1, 1, 2,  128, 8,  true, false>>(const S&, A);
 template float moe_smoothquant_<trait_<ck_tile::fp16_t, ck_tile::int8_t,  1, 2, 2,  128, 4,  true, false>>(const S&, A);
diff --git a/include/ck/host_utility/flush_cache.hpp b/include/ck/host_utility/flush_cache.hpp
index 2d051233e4..25084bae85 100644
--- a/include/ck/host_utility/flush_cache.hpp
+++ b/include/ck/host_utility/flush_cache.hpp
@@ -476,16 +476,6 @@ float launch_and_time_kernel_with_preprocess(const StreamConfig& stream_config,
             hip_check_error(hipGetLastError());
             // end real kernel
 
-            //             hip_check_error(hipEventRecord(stop, stream_config.stream_id_));
-            //             hip_check_error(hipEventSynchronize(stop));
-            //             float cur_time = 0;
-            //             hip_check_error(hipEventElapsedTime(&cur_time, start, stop));
-            // #if MEDIAN
-            //             times.insert(cur_time);
-            // #else
-            //             total_time += cur_time;
-            // #endif
-
 #if !defined(CK_USE_WMMA)
             if(ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
             {
diff --git a/include/ck/problem_transform/transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp b/include/ck/problem_transform/transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp
index fd50e61f32..7ccebaf35a 100644
--- a/include/ck/problem_transform/transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp
+++ b/include/ck/problem_transform/transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk.hpp
@@ -137,13 +137,6 @@ transform_forward_convolution3d_into_gemm_v4r4r4_ndhwc_kzyxc_ndhwk_pad(
         make_tuple(Sequence<0>{}, Sequence<1>{}),
         make_tuple(Sequence<0>{}, Sequence<1>{}));
 
-    // const auto out_grid_desc_gemmm_gemmn = transform_tensor_descriptor(
-    //     out_n_do_ho_wo_k_grid_desc,
-    //     make_tuple(make_merge_transform(make_tuple(N, Do, Ho, Wo)),
-    //                make_pass_through_transform(K)),
-    //     make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<3>{}),
-    //     make_tuple(Sequence<0>{}, Sequence<1>{}));
-
     return make_tuple(in_grid_desc_gemmk0_gemmm_gemmk1,
                       wei_grid_desc_gemmk0_gemmn_gemmk1,
                       out_grid_desc_gemmm_gemmn);
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_blockscale_b_preshuffle_selector.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_blockscale_b_preshuffle_selector.hpp
index 8df23454a2..41ca5916cb 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_blockscale_b_preshuffle_selector.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_blockscale_b_preshuffle_selector.hpp
@@ -60,32 +60,6 @@ constexpr auto BlockGemmBlockScaleBPreshufflePipeline_Selector()
             NRepeat,
             KPack>{};
     }
-#if 0
-    else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2)
-    {
-        return BlockwiseGemmXdlops_pipeline_blockscale_bpreshuffle_v2<
-            BlkGemmPipeSche,
-            BlockSize,
-            ADataType,
-            BDataType,
-            ComputeDataType,
-            AccDataType,
-            ATileDesc,
-            BTileDesc,
-            AMmaTileDesc,
-            BMmaTileDesc,
-            ABlockTransferSrcScalarPerVector,
-            BBlockTransferSrcScalarPerVector,
-            MPerBlock,
-            NPerBlock,
-            KPerBlock,
-            MPerXDL,
-            NPerXDL,
-            MRepeat,
-            NRepeat,
-            KPack>{};
-    }
-#endif
     else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
     {
         static_assert(MRepeat >= 4, "MRepeat should at least be 4 in BlockGemmPipelineVersion::v3");
diff --git a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_selector.hpp b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_selector.hpp
index 199c729f53..96bf5e81b7 100644
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_selector.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_moe_blockscale_b_preshuffle_selector.hpp
@@ -93,32 +93,6 @@ constexpr auto BlockGemmBlockMoeScaleBPreshufflePipeline_Selector()
                 KPack>{};
         }
     }
-#if 0
-    else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v2)
-    {
-        return BlockwiseGemmXdlops_pipeline_moe_blockscale_bpreshuffle_v2<
-            BlkGemmPipeSche,
-            BlockSize,
-            ADataType,
-            BDataType,
-            ComputeDataType,
-            AccDataType,
-            ATileDesc,
-            BTileDesc,
-            AMmaTileDesc,
-            BMmaTileDesc,
-            ABlockTransferSrcScalarPerVector,
-            BBlockTransferSrcScalarPerVector,
-            MPerBlock,
-            NPerBlock,
-            KPerBlock,
-            MPerXDL,
-            NPerXDL,
-            MRepeat,
-            NRepeat,
-            KPack>{};
-    }
-#endif
     else if constexpr(BlkGemmPipelineVer == BlockGemmPipelineVersion::v3)
     {
         static_assert(MRepeat >= 4, "MRepeat should at least be 4 in BlockGemmPipelineVersion::v3");
diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_direct_load.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_direct_load.hpp
index a31c9101a1..ade2839950 100644
--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_direct_load.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_direct_load.hpp
@@ -144,12 +144,6 @@ struct ThreadGroupTensorSliceTransfer_DirectLoad
                       "When loading more than one element per thread at once, the contiguous "
                       "dimension must be the same between source and destination.");
 
-        // constexpr auto dword_bytes           = 4;
-        // constexpr auto bytes_per_thread_load = ScalarPerVector * sizeof(SrcData);
-        // static_assert(bytes_per_thread_load == dword_bytes,
-        //               "Direct load transfer requires each thread to load exactly a single "
-        //               "DWORD of data.");
-
         static_assert(nDim == remove_cvref_t<SrcDesc>::GetNumOfDimension() &&
                           nDim == remove_cvref_t<DstDesc>::GetNumOfDimension() &&
                           nDim == ThreadClusterLengths::Size(),
diff --git a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_gather_direct_load.hpp b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_gather_direct_load.hpp
index 11043281ec..8c6e77bccd 100644
--- a/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_gather_direct_load.hpp
+++ b/include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_gather_direct_load.hpp
@@ -152,12 +152,6 @@ struct ThreadGroupTensorSliceTransfer_Gather_DirectLoad
                       "When loading more than one element per thread at once, the contiguous "
                       "dimension must be the same between source and destination.");
 
-        // constexpr auto dword_bytes           = 4;
-        // constexpr auto bytes_per_thread_load = ScalarPerVector * sizeof(SrcData);
-        // static_assert(bytes_per_thread_load == dword_bytes,
-        //               "Direct load transfer requires each thread to load exactly a single "
-        //               "DWORD of data.");
-
         static_assert(nDim == remove_cvref_t<SrcDesc>::GetNumOfDimension() &&
                           nDim == remove_cvref_t<DstDesc>::GetNumOfDimension() &&
                           nDim == ThreadClusterLengths::Size(),
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
index c64f2c42f3..69d8eef80a 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_contraction_multiple_d_wmma_cshuffle.hpp
@@ -737,11 +737,6 @@ struct DeviceBatchedContractionMultipleD_Wmma_CShuffle
 
         // Batch Offset
         ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_;
-
-        // for checking vector load/store
-        // index_t MRaw_;
-        // index_t NRaw_;
-        // index_t KRaw_;
     };
 
     // Invoker
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp
index 6b595c4dce..043adf5fc0 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_wmma_cshuffle.hpp
@@ -1433,147 +1433,6 @@ struct DeviceBatchedGemmSoftmaxGemmPermute_Wmma_CShuffle
         // TODO: properly implement this check
         return true;
     }
-#if 0
-    static bool IsSupportedArgument(const Argument& arg)
-    {
-        if(ck::is_gfx11_supported())
-        {
-            if constexpr(!(is_same_v<Acc0DataType, float> || is_same_v<Acc0DataType, int32_t>))
-            {
-                printf("DeviceOp: Acc0 Type err");
-                return false;
-            }
-
-            if constexpr(!(is_same_v<Acc1DataType, float> || is_same_v<Acc1DataType, int32_t>))
-            {
-                printf("DeviceOp: Acc1 Type err");
-                return false;
-            }
-        }
-        else
-        {
-            printf("DeviceOp: Arch err");
-            return false;
-        }
-
-        if(!GridwiseOp::CheckValidity(arg.a_grid_desc,
-                                      arg.b0_grid_desc,
-                                      arg.b1_grid_desc,
-                                      arg.c_grid_desc_m_n_,
-                                      arg.block_2_ctile_map_))
-        {
-            return false;
-        }
-
-        // Check if C permute dimension matches GEMM + GEMM shape
-        const index_t c_g = arg.c_grid_desc_g_m_n_.GetLength(I0); // unpadded
-
-        if(!(c_g == arg.batch_count_))
-        {
-            printf("DeviceOp: BatchCount err");
-            return false;
-        }
-
-        // Note: we need raw lengths since threadwise copy can not handle vector load when part of
-        // vector is out of bounds
-        // Note: need lowest dim in Ms/Ns/Ks/Os, not merged M/N/K/O
-        const auto MzRaw = arg.raw_lengths_mz_lz_kz_nz_[0];
-        const auto LzRaw = arg.raw_lengths_mz_lz_kz_nz_[1];
-        const auto KzRaw = arg.raw_lengths_mz_lz_kz_nz_[2];
-        const auto NzRaw = arg.raw_lengths_mz_lz_kz_nz_[3];
-
-        // Check scalar per vector requirement
-        const auto a_extent_lowest  = ABlockTransferSrcVectorDim == 2 ? KzRaw : MzRaw;
-        const auto b0_extent_lowest = B0BlockTransferSrcVectorDim == 2 ? KzRaw : LzRaw;
-        const auto b1_extent_lowest = B1BlockTransferSrcVectorDim == 2 ? LzRaw : NzRaw;
-        const auto c_extent_lowest  = NzRaw;
-
-        if(!(a_extent_lowest % ABlockTransferSrcScalarPerVector == 0 &&
-             b0_extent_lowest % B0BlockTransferSrcScalarPerVector == 0 &&
-             b1_extent_lowest % B1BlockTransferSrcScalarPerVector == 0 &&
-             c_extent_lowest % CShuffleBlockTransferScalarPerVector_NPerBlock == 0))
-        {
-            printf("DeviceOp: Data Transfer Vector scalar err");
-            return false;
-        }
-
-        // Check vector load/store requirement
-        const auto a_stride_lowest =
-            ABlockTransferSrcVectorDim == 2 ? arg.a_mz_kz_strides_[1] : arg.a_mz_kz_strides_[0];
-        const auto b0_stride_lowest =
-            B0BlockTransferSrcVectorDim == 2 ? arg.b0_lz_kz_strides_[1] : arg.b0_lz_kz_strides_[0];
-        const auto b1_stride_lowest =
-            B1BlockTransferSrcVectorDim == 2 ? arg.b1_nz_lz_strides_[1] : arg.b1_nz_lz_strides_[0];
-        const auto c_stride_lowest = arg.c_mz_nz_strides_[1];
-
-        if(!(a_stride_lowest == 1 || b0_stride_lowest == 1 || b1_stride_lowest == 1 ||
-             c_stride_lowest == 1))
-        {
-            printf("DeviceOp: Data Vectorize transfer err");
-            return false;
-        }
-
-        return true;
-    }
-
-    // polymorphic
-    bool IsSupportedArgument(const BaseArgument* p_arg) override
-    {
-        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
-    }
-
-    static auto MakeArgument(
-        const ADataType* p_a,
-        const B0DataType* p_b0,
-        const B1DataType* p_b1,
-        CDataType* p_c,
-        const std::array<void*, NumAcc0Bias> p_acc0_biases,
-        const std::array<void*, NumAcc1Bias> p_acc1_biases,
-        const std::array<index_t, NumDimG + NumDimM + NumDimN>& a_gs_ms_ks_lengths,
-        const std::array<index_t, NumDimG + NumDimM + NumDimN>& a_gs_ms_ks_strides,
-        const std::array<index_t, NumDimG + NumDimM + NumDimN>& b0_gs_ls_ks_lengths,
-        const std::array<index_t, NumDimG + NumDimM + NumDimN>& b0_gs_ls_ks_strides,
-        const std::array<index_t, NumDimG + NumDimM + NumDimN>& b1_gs_ns_ls_lengths,
-        const std::array<index_t, NumDimG + NumDimM + NumDimN>& b1_gs_ns_ls_strides,
-        const std::array<index_t, NumDimG + NumDimM + NumDimN>& c_gs_ms_ns_lengths,
-        const std::array<index_t, NumDimG + NumDimM + NumDimN>& c_gs_ms_ns_strides,
-        const std::array<std::vector<ck::index_t>, NumAcc0Bias> acc0_biases_gs_ms_ls_lengths,
-        const std::array<std::vector<ck::index_t>, NumAcc0Bias> acc0_biases_gs_ms_ls_strides,
-        const std::array<std::vector<ck::index_t>, NumAcc1Bias> acc1_biases_gs_ms_ns_lengths,
-        const std::array<std::vector<ck::index_t>, NumAcc1Bias> acc1_biases_gs_ms_ns_strides,
-        AElementwiseOperation a_element_op,
-        B0ElementwiseOperation b0_element_op,
-        AccElementwiseOperation acc_element_op,
-        B1ElementwiseOperation b1_element_op,
-        CElementwiseOperation c_element_op)
-    {
-        return Argument{p_a,
-                        p_b0,
-                        p_b1,
-                        p_c,
-                        p_acc0_biases,
-                        p_acc1_biases,
-                        a_gs_ms_ks_lengths,
-                        a_gs_ms_ks_strides,
-                        b0_gs_ls_ks_lengths,
-                        b0_gs_ls_ks_strides,
-                        b1_gs_ns_ls_lengths,
-                        b1_gs_ns_ls_strides,
-                        c_gs_ms_ns_lengths,
-                        c_gs_ms_ns_strides,
-                        acc0_biases_gs_ms_ls_lengths,
-                        acc0_biases_gs_ms_ls_strides,
-                        acc1_biases_gs_ms_ns_lengths,
-                        acc1_biases_gs_ms_ns_strides,
-                        1,
-                        1,
-                        a_element_op,
-                        b0_element_op,
-                        acc_element_op,
-                        b1_element_op,
-                        c_element_op};
-    }
-#endif
 
     // polymorphic
     std::unique_ptr<BaseArgument> MakeArgumentPointer(
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_query_attention_forward_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_query_attention_forward_wmma.hpp
index 6aa766ab5c..d1269c6d9a 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_query_attention_forward_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_query_attention_forward_wmma.hpp
@@ -956,147 +956,6 @@ struct DeviceGroupedQueryAttentionForward_Wmma
         // TODO: properly implement this check
         return true;
     }
-#if 0
-    static bool IsSupportedArgument(const Argument& arg)
-    {
-        if(ck::is_gfx11_supported())
-        {
-            if constexpr(!(is_same_v<Acc0DataType, float> || is_same_v<Acc0DataType, int32_t>))
-            {
-                printf("DeviceOp: Acc0 Type err");
-                return false;
-            }
-
-            if constexpr(!(is_same_v<Acc1DataType, float> || is_same_v<Acc1DataType, int32_t>))
-            {
-                printf("DeviceOp: Acc1 Type err");
-                return false;
-            }
-        }
-        else
-        {
-            printf("DeviceOp: Arch err");
-            return false;
-        }
-
-        if(!GridwiseOp::CheckValidity(arg.a_grid_desc,
-                                      arg.b0_grid_desc,
-                                      arg.b1_grid_desc,
-                                      arg.c_grid_desc_m_n_,
-                                      arg.block_2_ctile_map_))
-        {
-            return false;
-        }
-
-        // Check if C permute dimension matches GEMM + GEMM shape
-        const index_t c_g = arg.c_grid_desc_g_m_n_.GetLength(I0); // unpadded
-
-        if(!(c_g == arg.batch_count_))
-        {
-            printf("DeviceOp: BatchCount err");
-            return false;
-        }
-
-        // Note: we need raw lengths since threadwise copy can not handle vector load when part of
-        // vector is out of bounds
-        // Note: need lowest dim in Ms/Ns/Ks/Os, not merged M/N/K/O
-        const auto MzRaw = arg.raw_lengths_mz_lz_kz_nz_[0];
-        const auto LzRaw = arg.raw_lengths_mz_lz_kz_nz_[1];
-        const auto KzRaw = arg.raw_lengths_mz_lz_kz_nz_[2];
-        const auto NzRaw = arg.raw_lengths_mz_lz_kz_nz_[3];
-
-        // Check scalar per vector requirement
-        const auto a_extent_lowest  = ABlockTransferSrcVectorDim == 2 ? KzRaw : MzRaw;
-        const auto b0_extent_lowest = B0BlockTransferSrcVectorDim == 2 ? KzRaw : LzRaw;
-        const auto b1_extent_lowest = B1BlockTransferSrcVectorDim == 2 ? LzRaw : NzRaw;
-        const auto c_extent_lowest  = NzRaw;
-
-        if(!(a_extent_lowest % ABlockTransferSrcScalarPerVector == 0 &&
-             b0_extent_lowest % B0BlockTransferSrcScalarPerVector == 0 &&
-             b1_extent_lowest % B1BlockTransferSrcScalarPerVector == 0 &&
-             c_extent_lowest % CShuffleBlockTransferScalarPerVector_NPerBlock == 0))
-        {
-            printf("DeviceOp: Data Transfer Vector scalar err");
-            return false;
-        }
-
-        // Check vector load/store requirement
-        const auto a_stride_lowest =
-            ABlockTransferSrcVectorDim == 2 ? arg.a_mz_kz_strides_[1] : arg.a_mz_kz_strides_[0];
-        const auto b0_stride_lowest =
-            B0BlockTransferSrcVectorDim == 2 ? arg.b0_lz_kz_strides_[1] : arg.b0_lz_kz_strides_[0];
-        const auto b1_stride_lowest =
-            B1BlockTransferSrcVectorDim == 2 ? arg.b1_nz_lz_strides_[1] : arg.b1_nz_lz_strides_[0];
-        const auto c_stride_lowest = arg.c_mz_nz_strides_[1];
-
-        if(!(a_stride_lowest == 1 || b0_stride_lowest == 1 || b1_stride_lowest == 1 ||
-             c_stride_lowest == 1))
-        {
-            printf("DeviceOp: Data Vectorize transfer err");
-            return false;
-        }
-
-        return true;
-    }
-
-    // polymorphic
-    bool IsSupportedArgument(const BaseArgument* p_arg) override
-    {
-        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
-    }
-
-    static auto MakeArgument(
-        const ADataType* p_a,
-        const B0DataType* p_b0,
-        const B1DataType* p_b1,
-        CDataType* p_c,
-        const std::array<void*, NumAcc0Bias> p_acc0_biases,
-        const std::array<void*, NumAcc1Bias> p_acc1_biases,
-        const std::array<index_t, NumDimG + NumDimM + NumDimN>& a_gs_ms_ks_lengths,
-        const std::array<index_t, NumDimG + NumDimM + NumDimN>& a_gs_ms_ks_strides,
-        const std::array<index_t, NumDimG + NumDimM + NumDimN>& b0_gs_ls_ks_lengths,
-        const std::array<index_t, NumDimG + NumDimM + NumDimN>& b0_gs_ls_ks_strides,
-        const std::array<index_t, NumDimG + NumDimM + NumDimN>& b1_gs_ns_ls_lengths,
-        const std::array<index_t, NumDimG + NumDimM + NumDimN>& b1_gs_ns_ls_strides,
-        const std::array<index_t, NumDimG + NumDimM + NumDimN>& c_gs_ms_ns_lengths,
-        const std::array<index_t, NumDimG + NumDimM + NumDimN>& c_gs_ms_ns_strides,
-        const std::array<std::vector<ck::index_t>, NumAcc0Bias> acc0_biases_gs_ms_ls_lengths,
-        const std::array<std::vector<ck::index_t>, NumAcc0Bias> acc0_biases_gs_ms_ls_strides,
-        const std::array<std::vector<ck::index_t>, NumAcc1Bias> acc1_biases_gs_ms_ns_lengths,
-        const std::array<std::vector<ck::index_t>, NumAcc1Bias> acc1_biases_gs_ms_ns_strides,
-        AElementwiseOperation a_element_op,
-        B0ElementwiseOperation b0_element_op,
-        AccElementwiseOperation acc_element_op,
-        B1ElementwiseOperation b1_element_op,
-        CElementwiseOperation c_element_op)
-    {
-        return Argument{p_a,
-                        p_b0,
-                        p_b1,
-                        p_c,
-                        p_acc0_biases,
-                        p_acc1_biases,
-                        a_gs_ms_ks_lengths,
-                        a_gs_ms_ks_strides,
-                        b0_gs_ls_ks_lengths,
-                        b0_gs_ls_ks_strides,
-                        b1_gs_ns_ls_lengths,
-                        b1_gs_ns_ls_strides,
-                        c_gs_ms_ns_lengths,
-                        c_gs_ms_ns_strides,
-                        acc0_biases_gs_ms_ls_lengths,
-                        acc0_biases_gs_ms_ls_strides,
-                        acc1_biases_gs_ms_ns_lengths,
-                        acc1_biases_gs_ms_ns_strides,
-                        1,
-                        1,
-                        a_element_op,
-                        b0_element_op,
-                        acc_element_op,
-                        b1_element_op,
-                        c_element_op};
-    }
-#endif
 
     // polymorphic
     std::unique_ptr<BaseArgument> MakeArgumentPointer(
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_multi_query_attention_forward_wmma.hpp b/include/ck/tensor_operation/gpu/device/impl/device_multi_query_attention_forward_wmma.hpp
index a303b6f808..a9d916c6a0 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_multi_query_attention_forward_wmma.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_multi_query_attention_forward_wmma.hpp
@@ -948,147 +948,6 @@ struct DeviceMultiQueryAttentionForward_Wmma
         // TODO: properly implement this check
         return true;
     }
-#if 0
-    static bool IsSupportedArgument(const Argument& arg)
-    {
-        if(ck::is_gfx11_supported())
-        {
-            if constexpr(!(is_same_v<Acc0DataType, float> || is_same_v<Acc0DataType, int32_t>))
-            {
-                printf("DeviceOp: Acc0 Type err");
-                return false;
-            }
-
-            if constexpr(!(is_same_v<Acc1DataType, float> || is_same_v<Acc1DataType, int32_t>))
-            {
-                printf("DeviceOp: Acc1 Type err");
-                return false;
-            }
-        }
-        else
-        {
-            printf("DeviceOp: Arch err");
-            return false;
-        }
-
-        if(!GridwiseOp::CheckValidity(arg.a_grid_desc,
-                                      arg.b0_grid_desc,
-                                      arg.b1_grid_desc,
-                                      arg.c_grid_desc_m_n_,
-                                      arg.block_2_ctile_map_))
-        {
-            return false;
-        }
-
-        // Check if C permute dimension matches GEMM + GEMM shape
-        const index_t c_g = arg.c_grid_desc_g_m_n_.GetLength(I0); // unpadded
-
-        if(!(c_g == arg.batch_count_))
-        {
-            printf("DeviceOp: BatchCount err");
-            return false;
-        }
-
-        // Note: we need raw lengths since threadwise copy can not handle vector load when part of
-        // vector is out of bounds
-        // Note: need lowest dim in Ms/Ns/Ks/Os, not merged M/N/K/O
-        const auto MzRaw = arg.raw_lengths_mz_lz_kz_nz_[0];
-        const auto LzRaw = arg.raw_lengths_mz_lz_kz_nz_[1];
-        const auto KzRaw = arg.raw_lengths_mz_lz_kz_nz_[2];
-        const auto NzRaw = arg.raw_lengths_mz_lz_kz_nz_[3];
-
-        // Check scalar per vector requirement
-        const auto a_extent_lowest  = ABlockTransferSrcVectorDim == 2 ? KzRaw : MzRaw;
-        const auto b0_extent_lowest = B0BlockTransferSrcVectorDim == 2 ? KzRaw : LzRaw;
-        const auto b1_extent_lowest = B1BlockTransferSrcVectorDim == 2 ? LzRaw : NzRaw;
-        const auto c_extent_lowest  = NzRaw;
-
-        if(!(a_extent_lowest % ABlockTransferSrcScalarPerVector == 0 &&
-             b0_extent_lowest % B0BlockTransferSrcScalarPerVector == 0 &&
-             b1_extent_lowest % B1BlockTransferSrcScalarPerVector == 0 &&
-             c_extent_lowest % CShuffleBlockTransferScalarPerVector_NPerBlock == 0))
-        {
-            printf("DeviceOp: Data Transfer Vector scalar err");
-            return false;
-        }
-
-        // Check vector load/store requirement
-        const auto a_stride_lowest =
-            ABlockTransferSrcVectorDim == 2 ? arg.a_mz_kz_strides_[1] : arg.a_mz_kz_strides_[0];
-        const auto b0_stride_lowest =
-            B0BlockTransferSrcVectorDim == 2 ? arg.b0_lz_kz_strides_[1] : arg.b0_lz_kz_strides_[0];
-        const auto b1_stride_lowest =
-            B1BlockTransferSrcVectorDim == 2 ? arg.b1_nz_lz_strides_[1] : arg.b1_nz_lz_strides_[0];
-        const auto c_stride_lowest = arg.c_mz_nz_strides_[1];
-
-        if(!(a_stride_lowest == 1 || b0_stride_lowest == 1 || b1_stride_lowest == 1 ||
-             c_stride_lowest == 1))
-        {
-            printf("DeviceOp: Data Vectorize transfer err");
-            return false;
-        }
-
-        return true;
-    }
-
-    // polymorphic
-    bool IsSupportedArgument(const BaseArgument* p_arg) override
-    {
-        return IsSupportedArgument(*dynamic_cast<const Argument*>(p_arg));
-    }
-
-    static auto MakeArgument(
-        const ADataType* p_a,
-        const B0DataType* p_b0,
-        const B1DataType* p_b1,
-        CDataType* p_c,
-        const std::array<void*, NumAcc0Bias> p_acc0_biases,
-        const std::array<void*, NumAcc1Bias> p_acc1_biases,
-        const std::array<index_t, NumDimG + NumDimM + NumDimN>& a_gs_ms_ks_lengths,
-        const std::array<index_t, NumDimG + NumDimM + NumDimN>& a_gs_ms_ks_strides,
-        const std::array<index_t, NumDimG + NumDimM + NumDimN>& b0_gs_ls_ks_lengths,
-        const std::array<index_t, NumDimG + NumDimM + NumDimN>& b0_gs_ls_ks_strides,
-        const std::array<index_t, NumDimG + NumDimM + NumDimN>& b1_gs_ns_ls_lengths,
-        const std::array<index_t, NumDimG + NumDimM + NumDimN>& b1_gs_ns_ls_strides,
-        const std::array<index_t, NumDimG + NumDimM + NumDimN>& c_gs_ms_ns_lengths,
-        const std::array<index_t, NumDimG + NumDimM + NumDimN>& c_gs_ms_ns_strides,
-        const std::array<std::vector<ck::index_t>, NumAcc0Bias> acc0_biases_gs_ms_ls_lengths,
-        const std::array<std::vector<ck::index_t>, NumAcc0Bias> acc0_biases_gs_ms_ls_strides,
-        const std::array<std::vector<ck::index_t>, NumAcc1Bias> acc1_biases_gs_ms_ns_lengths,
-        const std::array<std::vector<ck::index_t>, NumAcc1Bias> acc1_biases_gs_ms_ns_strides,
-        AElementwiseOperation a_element_op,
-        B0ElementwiseOperation b0_element_op,
-        AccElementwiseOperation acc_element_op,
-        B1ElementwiseOperation b1_element_op,
-        CElementwiseOperation c_element_op)
-    {
-        return Argument{p_a,
-                        p_b0,
-                        p_b1,
-                        p_c,
-                        p_acc0_biases,
-                        p_acc1_biases,
-                        a_gs_ms_ks_lengths,
-                        a_gs_ms_ks_strides,
-                        b0_gs_ls_ks_lengths,
-                        b0_gs_ls_ks_strides,
-                        b1_gs_ns_ls_lengths,
-                        b1_gs_ns_ls_strides,
-                        c_gs_ms_ns_lengths,
-                        c_gs_ms_ns_strides,
-                        acc0_biases_gs_ms_ls_lengths,
-                        acc0_biases_gs_ms_ls_strides,
-                        acc1_biases_gs_ms_ns_lengths,
-                        acc1_biases_gs_ms_ns_strides,
-                        1,
-                        1,
-                        a_element_op,
-                        b0_element_op,
-                        acc_element_op,
-                        b1_element_op,
-                        c_element_op};
-    }
-#endif
 
     // polymorphic
     std::unique_ptr<BaseArgument> MakeArgumentPointer(
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
index d66679a318..76f0b5a893 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_multiple_d_xdl_cshuffle.hpp
@@ -464,12 +464,6 @@ struct GridwiseGemmMultipleD_xdl_cshuffle
             return false;
         }
 
-        // check block-to-E-tile
-        // if(!block_2_etile_map.CheckValidity(e_grid_desc_m_n))
-        //{
-        // return false;
-        //}
-
         // TODO: also check validity of all components (blockwise-copy, threadwise-copy, etc)
         // check tensor size: cannot be larger than 2GB each
         constexpr long_index_t TwoGB = (long_index_t{1} << 31);
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp
index 4b679adc8d..2252ebf980 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_streamk_v3.hpp
@@ -351,74 +351,6 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
             make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
 
         return a_grid_desc_ak0_m_ak1;
-#if 0
-        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
-
-        if constexpr(GemmSpec == GemmSpecialization::MKPadding ||
-                     GemmSpec == GemmSpecialization::MNKPadding)
-        {
-            // pad both M and K
-            const auto a_grid_desc_m_k =
-                transform_tensor_descriptor(a_grid_desc_mraw_kraw,
-                                            make_tuple(make_right_pad_transform(M, MPad - M),
-                                                       make_right_pad_transform(K, KPad - K)),
-                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
-                a_grid_desc_m_k,
-                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
-                           make_pass_through_transform(MPad)),
-                make_tuple(Sequence<1>{}, Sequence<0>{}),
-                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return a_grid_desc_ak0_m_ak1;
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
-                          GemmSpec == GemmSpecialization::MNPadding)
-        {
-            // pad M, but not K
-            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
-                a_grid_desc_mraw_kraw,
-                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
-                           make_right_pad_transform(M, MPad - M)),
-                make_tuple(Sequence<1>{}, Sequence<0>{}),
-                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return a_grid_desc_ak0_m_ak1;
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
-                          GemmSpec == GemmSpecialization::NKPadding)
-        {
-            // pad K, but not M
-            const auto a_grid_desc_m_k = transform_tensor_descriptor(
-                a_grid_desc_mraw_kraw,
-                make_tuple(make_pass_through_transform(M), make_right_pad_transform(K, KPad - K)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
-                a_grid_desc_m_k,
-                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
-                           make_pass_through_transform(M)),
-                make_tuple(Sequence<1>{}, Sequence<0>{}),
-                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return a_grid_desc_ak0_m_ak1;
-        }
-        else
-        {
-            // not pad M or K
-            const auto a_grid_desc_ak0_m_ak1 = transform_tensor_descriptor(
-                a_grid_desc_mraw_kraw,
-                make_tuple(make_unmerge_transform(make_tuple(AK0, AK1Value)),
-                           make_pass_through_transform(M)),
-                make_tuple(Sequence<1>{}, Sequence<0>{}),
-                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return a_grid_desc_ak0_m_ak1;
-        }
-#endif
     }
 
     __device__ static auto MakeBGridDescriptor_BK0_N_BK1(
@@ -451,74 +383,6 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
             make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
 
         return b_grid_desc_bk0_n_bk1;
-#if 0     
-        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
-
-        if constexpr(GemmSpec == GemmSpecialization::NKPadding ||
-                     GemmSpec == GemmSpecialization::MNKPadding)
-        {
-            // pad both N and K
-            const auto b_grid_desc_n_k =
-                transform_tensor_descriptor(b_grid_desc_nraw_kraw,
-                                            make_tuple(make_right_pad_transform(N, NPad - N),
-                                                       make_right_pad_transform(K, KPad - K)),
-                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
-                b_grid_desc_n_k,
-                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
-                           make_pass_through_transform(NPad)),
-                make_tuple(Sequence<1>{}, Sequence<0>{}),
-                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return b_grid_desc_bk0_n_bk1;
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
-                          GemmSpec == GemmSpecialization::MNPadding)
-        {
-            // pad N, but not K
-            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
-                b_grid_desc_nraw_kraw,
-                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
-                           make_right_pad_transform(N, NPad - N)),
-                make_tuple(Sequence<1>{}, Sequence<0>{}),
-                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return b_grid_desc_bk0_n_bk1;
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::KPadding ||
-                          GemmSpec == GemmSpecialization::MKPadding)
-        {
-            // pad K, but not N
-            const auto b_grid_desc_n_k = transform_tensor_descriptor(
-                b_grid_desc_nraw_kraw,
-                make_tuple(make_pass_through_transform(N), make_right_pad_transform(K, KPad - K)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-
-            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
-                b_grid_desc_n_k,
-                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
-                           make_pass_through_transform(N)),
-                make_tuple(Sequence<1>{}, Sequence<0>{}),
-                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return b_grid_desc_bk0_n_bk1;
-        }
-        else
-        {
-            // not pad N or K
-            const auto b_grid_desc_bk0_n_bk1 = transform_tensor_descriptor(
-                b_grid_desc_nraw_kraw,
-                make_tuple(make_unmerge_transform(make_tuple(BK0, BK1Value)),
-                           make_pass_through_transform(N)),
-                make_tuple(Sequence<1>{}, Sequence<0>{}),
-                make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
-
-            return b_grid_desc_bk0_n_bk1;
-        }
-#endif
     }
 
     template <typename ABlockDesc_AK0_M_AK1>
@@ -559,45 +423,6 @@ struct GridwiseGemm_xdl_cshuffle_streamk_v3
                                                       make_right_pad_transform(N, NPad - N)),
                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
-#if 0
-        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
-                     GemmSpec == GemmSpecialization::MNKPadding)
-        {
-            // pad M and N
-            return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
-                                               make_tuple(make_right_pad_transform(M, MPad - M),
-                                                          make_right_pad_transform(N, NPad - N)),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
-                          GemmSpec == GemmSpecialization::MKPadding)
-        {
-            // pad M, but not N
-            return transform_tensor_descriptor(
-                c_grid_desc_mraw_nraw,
-                make_tuple(make_right_pad_transform(M, MPad - M), make_pass_through_transform(N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
-                          GemmSpec == GemmSpecialization::NKPadding)
-        {
-            // pad N, but not M
-            return transform_tensor_descriptor(
-                c_grid_desc_mraw_nraw,
-                make_tuple(make_pass_through_transform(M), make_right_pad_transform(N, NPad - N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else
-        {
-            // not pad M or N
-            return c_grid_desc_mraw_nraw;
-        }
-#endif
     }
 
     struct Problem
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
index 5c5eb9405f..d926efab84 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3.hpp
@@ -682,45 +682,6 @@ struct GridwiseGemm_xdl_cshuffle_v3
                                                       make_right_pad_transform(N, NPad - N)),
                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
-#if 0
-        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
-                     GemmSpec == GemmSpecialization::MNKPadding)
-        {
-            // pad M and N
-            return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
-                                               make_tuple(make_right_pad_transform(M, MPad - M),
-                                                          make_right_pad_transform(N, NPad - N)),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
-                          GemmSpec == GemmSpecialization::MKPadding)
-        {
-            // pad M, but not N
-            return transform_tensor_descriptor(
-                c_grid_desc_mraw_nraw,
-                make_tuple(make_right_pad_transform(M, MPad - M), make_pass_through_transform(N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
-                          GemmSpec == GemmSpecialization::NKPadding)
-        {
-            // pad N, but not M
-            return transform_tensor_descriptor(
-                c_grid_desc_mraw_nraw,
-                make_tuple(make_pass_through_transform(M), make_right_pad_transform(N, NPad - N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else
-        {
-            // not pad M or N
-            return c_grid_desc_mraw_nraw;
-        }
-#endif
     }
 
     struct Problem
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp
index 7f1a42fb26..a81679ea78 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_preshuffle.hpp
@@ -613,45 +613,6 @@ struct GridwiseGemm_xdl_cshuffle_v3_b_preshuffle
                                                       make_right_pad_transform(N, NPad - N)),
                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
-#if 0
-        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
-                     GemmSpec == GemmSpecialization::MNKPadding)
-        {
-            // pad M and N
-            return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
-                                               make_tuple(make_right_pad_transform(M, MPad - M),
-                                                          make_right_pad_transform(N, NPad - N)),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
-                          GemmSpec == GemmSpecialization::MKPadding)
-        {
-            // pad M, but not N
-            return transform_tensor_descriptor(
-                c_grid_desc_mraw_nraw,
-                make_tuple(make_right_pad_transform(M, MPad - M), make_pass_through_transform(N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
-                          GemmSpec == GemmSpecialization::NKPadding)
-        {
-            // pad N, but not M
-            return transform_tensor_descriptor(
-                c_grid_desc_mraw_nraw,
-                make_tuple(make_pass_through_transform(M), make_right_pad_transform(N, NPad - N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else
-        {
-            // not pad M or N
-            return c_grid_desc_mraw_nraw;
-        }
-#endif
     }
 
     struct Problem
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp
index daa4fd2e8a..f9be9e494b 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_b_scale.hpp
@@ -568,45 +568,6 @@ struct GridwiseGemm_xdl_cshuffle_v3
                                                       make_right_pad_transform(N, NPad - N)),
                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
-#if 0
-        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
-                     GemmSpec == GemmSpecialization::MNKPadding)
-        {
-            // pad M and N
-            return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
-                                               make_tuple(make_right_pad_transform(M, MPad - M),
-                                                          make_right_pad_transform(N, NPad - N)),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
-                          GemmSpec == GemmSpecialization::MKPadding)
-        {
-            // pad M, but not N
-            return transform_tensor_descriptor(
-                c_grid_desc_mraw_nraw,
-                make_tuple(make_right_pad_transform(M, MPad - M), make_pass_through_transform(N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
-                          GemmSpec == GemmSpecialization::NKPadding)
-        {
-            // pad N, but not M
-            return transform_tensor_descriptor(
-                c_grid_desc_mraw_nraw,
-                make_tuple(make_pass_through_transform(M), make_right_pad_transform(N, NPad - N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else
-        {
-            // not pad M or N
-            return c_grid_desc_mraw_nraw;
-        }
-#endif
     }
 
     struct Problem
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp
index f018730300..529248093b 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_abd.hpp
@@ -806,58 +806,6 @@ struct GridwiseGemm_xdl_cshuffle_v3
         index_t b_k_split_offset;
     };
 
-#if 0
-    struct SplitKBatchOffsetMultiABD
-    {
-        __device__ SplitKBatchOffsetMultiABD(AsGridPointer& p_as_grid,
-                                             BsGridPointer& p_bs_grid,
-                                             Argument& karg)
-        {
-            static_for<0, NumATensor, 1>{}([&](auto i) {
-                using ALayout_ = remove_cvref_t<tuple_element_t<i.value, AsLayout>>;
-                if constexpr(is_same_v<tensor_layout::gemm::RowMajor, ALayout_>)
-                {
-                    as_k_split_offset[i] = blockIdx.z * karg.KRead;
-                }
-                else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, ALayout_>)
-                {
-                    as_k_split_offset[i] = blockIdx.z * karg.KRead * karg.StrideAs[i];
-                }
-
-                p_as_grid_(i) = p_as_grid[i] + as_k_split_offset[i];
-            });
-
-            static_for<0, NumBTensor, 1>{}([&](auto i) {
-                using BLayout_ = remove_cvref_t<tuple_element_t<i.value, BsLayout>>;
-                if constexpr(is_same_v<tensor_layout::gemm::RowMajor, BLayout_>)
-                {
-                    bs_k_split_offset[i] = blockIdx.z * karg.KRead * karg.StrideBs[i];
-                }
-                else if constexpr(is_same_v<tensor_layout::gemm::ColumnMajor, BLayout_>)
-                {
-                    bs_k_split_offset[i] = blockIdx.z * karg.KRead;
-                }
-
-                p_bs_grid_(i) = p_bs_grid[i] + bs_k_split_offset[i];
-            });
-
-            if(blockIdx.z < static_cast<uint32_t>(karg.KBatch - 1))
-            {
-                karg.K = karg.KRead;
-            }
-            else
-            {
-                karg.K = karg.K - karg.KRead * (karg.KBatch - 1);
-            }
-        }
-
-        AsGridPointer p_as_grid_;
-        BsGridPointer p_bs_grid_;
-        std::array<index_t, NumATensor> as_k_split_offset;
-        std::array<index_t, NumBTensor> bs_k_split_offset;
-    };
-#endif
-
     using BlockwiseGemmPipe = remove_cvref_t<
         decltype(BlockGemmPipeline_Selector<
                  BlkGemmPipelineVer,
@@ -1129,10 +1077,6 @@ struct GridwiseGemm_xdl_cshuffle_v3
         // BsGridPointer p_bs_grid;
         // DsGridPointer p_ds_grid;
 
-        // const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
-        //    problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0);
-        // const auto b_grid_desc_bk0_n_bk1 = MakeBGridDescriptor_BK0_N_BK1(
-        //    problem.K, problem.KPadded, problem.N, problem.NPadded, problem.StrideB, problem.BK0);
         const auto as_grid_desc_ak0_m_ak1 = MakeAsGridDescriptor_AK0_M_AK1(
             problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideAs, problem.AK0);
         const auto bs_grid_desc_bk0_n_bk1 = MakeBsGridDescriptor_BK0_N_BK1(
@@ -1147,22 +1091,10 @@ struct GridwiseGemm_xdl_cshuffle_v3
         const auto ds_grid_desc_m_n = MakeDsGridDescriptor_M_N(
             problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideDs);
 
-#if 0
-        static_for<0, NumDTensor, 1>{}([&](auto j) {
-            ds_grid_desc_m_n(j) = MakeCGridDescriptor_M_N(
-                problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideDs[j]);
-        });
-#endif
-
         const auto ds_grid_desc_mblock_mperblock_nblock_nperblock =
             MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
                 ds_grid_desc_m_n, problem.MBlock, problem.NBlock);
 
-        // const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-        //    p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
-        // const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-        //    p_bs_grid[I0], b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
-
         const auto as_grid_buf = generate_tuple(
             [&](auto i) {
                 return make_dynamic_buffer<AddressSpaceEnum::Global>(
@@ -1406,10 +1338,6 @@ struct GridwiseGemm_xdl_cshuffle_v3
                                     const BElementwiseOperation& b_element_op,
                                     const CElementwiseOperation& c_element_op)
     {
-        // const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
-        //    problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideA, problem.AK0);
-        // const auto b_grid_desc_bk0_n_bk1 = MakeBGridDescriptor_BK0_N_BK1(
-        //    problem.K, problem.KPadded, problem.N, problem.NPadded, problem.StrideB, problem.BK0);
         const auto as_grid_desc_ak0_m_ak1 = MakeAsGridDescriptor_AK0_M_AK1(
             problem.M, problem.MPadded, problem.K, problem.KPadded, problem.StrideAs, problem.AK0);
         const auto bs_grid_desc_bk0_n_bk1 = MakeBsGridDescriptor_BK0_N_BK1(
@@ -1428,10 +1356,6 @@ struct GridwiseGemm_xdl_cshuffle_v3
             MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
                 ds_grid_desc_m_n, problem.MBlock, problem.NBlock);
 
-        // const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-        //    p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
-        // const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-        //    p_b_grid, b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
         const auto as_grid_buf = generate_tuple(
             [&](auto i) {
                 return make_dynamic_buffer<AddressSpaceEnum::Global>(
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp
index a3dffed09d..671cfe4967 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d.hpp
@@ -642,45 +642,6 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3
                                                       make_right_pad_transform(N, NPad - N)),
                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
-#if 0
-        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
-                     GemmSpec == GemmSpecialization::MNKPadding)
-        {
-            // pad M and N
-            return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
-                                               make_tuple(make_right_pad_transform(M, MPad - M),
-                                                          make_right_pad_transform(N, NPad - N)),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
-                          GemmSpec == GemmSpecialization::MKPadding)
-        {
-            // pad M, but not N
-            return transform_tensor_descriptor(
-                c_grid_desc_mraw_nraw,
-                make_tuple(make_right_pad_transform(M, MPad - M), make_pass_through_transform(N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
-                          GemmSpec == GemmSpecialization::NKPadding)
-        {
-            // pad N, but not M
-            return transform_tensor_descriptor(
-                c_grid_desc_mraw_nraw,
-                make_tuple(make_pass_through_transform(M), make_right_pad_transform(N, NPad - N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else
-        {
-            // not pad M or N
-            return c_grid_desc_mraw_nraw;
-        }
-#endif
     }
 
     __host__ __device__ static auto MakeDsGridDescriptor_M_N(
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp
index 36895f55ea..54260d4386 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_ab_scale.hpp
@@ -558,45 +558,6 @@ struct GridwiseGemmMultiD_ABScale_xdl_cshuffle_v3
                                                       make_right_pad_transform(N, NPad - N)),
                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
-#if 0
-        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
-                     GemmSpec == GemmSpecialization::MNKPadding)
-        {
-            // pad M and N
-            return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
-                                               make_tuple(make_right_pad_transform(M, MPad - M),
-                                                          make_right_pad_transform(N, NPad - N)),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
-                          GemmSpec == GemmSpecialization::MKPadding)
-        {
-            // pad M, but not N
-            return transform_tensor_descriptor(
-                c_grid_desc_mraw_nraw,
-                make_tuple(make_right_pad_transform(M, MPad - M), make_pass_through_transform(N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
-                          GemmSpec == GemmSpecialization::NKPadding)
-        {
-            // pad N, but not M
-            return transform_tensor_descriptor(
-                c_grid_desc_mraw_nraw,
-                make_tuple(make_pass_through_transform(M), make_right_pad_transform(N, NPad - N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else
-        {
-            // not pad M or N
-            return c_grid_desc_mraw_nraw;
-        }
-#endif
     }
 
     __host__ __device__ static auto MakeDsGridDescriptor_M_N(
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp
index e810a467e7..28bcf14cd0 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_multi_d_b_preshuffle.hpp
@@ -609,45 +609,6 @@ struct GridwiseGemmMultiD_xdl_cshuffle_v3_b_preshuffle
                                                       make_right_pad_transform(N, NPad - N)),
                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
-#if 0
-        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
-                     GemmSpec == GemmSpecialization::MNKPadding)
-        {
-            // pad M and N
-            return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
-                                               make_tuple(make_right_pad_transform(M, MPad - M),
-                                                          make_right_pad_transform(N, NPad - N)),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
-                          GemmSpec == GemmSpecialization::MKPadding)
-        {
-            // pad M, but not N
-            return transform_tensor_descriptor(
-                c_grid_desc_mraw_nraw,
-                make_tuple(make_right_pad_transform(M, MPad - M), make_pass_through_transform(N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
-                          GemmSpec == GemmSpecialization::NKPadding)
-        {
-            // pad N, but not M
-            return transform_tensor_descriptor(
-                c_grid_desc_mraw_nraw,
-                make_tuple(make_pass_through_transform(M), make_right_pad_transform(N, NPad - N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else
-        {
-            // not pad M or N
-            return c_grid_desc_mraw_nraw;
-        }
-#endif
     }
 
     __host__ __device__ static auto MakeDsGridDescriptor_M_N(
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp
index d2dd1d243c..fa231c9b02 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx.hpp
@@ -669,45 +669,6 @@ struct GridwiseGemmMX_xdl_cshuffle_v3
                                                       make_right_pad_transform(N, NPad - N)),
                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
-#if 0
-        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
-                     GemmSpec == GemmSpecialization::MNKPadding)
-        {
-            // pad M and N
-            return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
-                                               make_tuple(make_right_pad_transform(M, MPad - M),
-                                                          make_right_pad_transform(N, NPad - N)),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
-                          GemmSpec == GemmSpecialization::MKPadding)
-        {
-            // pad M, but not N
-            return transform_tensor_descriptor(
-                c_grid_desc_mraw_nraw,
-                make_tuple(make_right_pad_transform(M, MPad - M), make_pass_through_transform(N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
-                          GemmSpec == GemmSpecialization::NKPadding)
-        {
-            // pad N, but not M
-            return transform_tensor_descriptor(
-                c_grid_desc_mraw_nraw,
-                make_tuple(make_pass_through_transform(M), make_right_pad_transform(N, NPad - N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else
-        {
-            // not pad M or N
-            return c_grid_desc_mraw_nraw;
-        }
-#endif
     }
 
     struct Problem
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx_bpreshuffle.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx_bpreshuffle.hpp
index 88f5dd44f3..43a46d6ff4 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx_bpreshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdl_cshuffle_v3_mx_bpreshuffle.hpp
@@ -696,45 +696,6 @@ struct GridwiseGemmMX_xdl_cshuffle_v3_bpreshuffle
                                                       make_right_pad_transform(N, NPad - N)),
                                            make_tuple(Sequence<0>{}, Sequence<1>{}),
                                            make_tuple(Sequence<0>{}, Sequence<1>{}));
-#if 0
-        using GemmSpecialization = tensor_operation::device::GemmSpecialization;
-
-        if constexpr(GemmSpec == GemmSpecialization::MNPadding ||
-                     GemmSpec == GemmSpecialization::MNKPadding)
-        {
-            // pad M and N
-            return transform_tensor_descriptor(c_grid_desc_mraw_nraw,
-                                               make_tuple(make_right_pad_transform(M, MPad - M),
-                                                          make_right_pad_transform(N, NPad - N)),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}),
-                                               make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::MPadding ||
-                          GemmSpec == GemmSpecialization::MKPadding)
-        {
-            // pad M, but not N
-            return transform_tensor_descriptor(
-                c_grid_desc_mraw_nraw,
-                make_tuple(make_right_pad_transform(M, MPad - M), make_pass_through_transform(N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else if constexpr(GemmSpec == GemmSpecialization::NPadding ||
-                          GemmSpec == GemmSpecialization::NKPadding)
-        {
-            // pad N, but not M
-            return transform_tensor_descriptor(
-                c_grid_desc_mraw_nraw,
-                make_tuple(make_pass_through_transform(M), make_right_pad_transform(N, NPad - N)),
-                make_tuple(Sequence<0>{}, Sequence<1>{}),
-                make_tuple(Sequence<0>{}, Sequence<1>{}));
-        }
-        else
-        {
-            // not pad M or N
-            return c_grid_desc_mraw_nraw;
-        }
-#endif
     }
 
     struct Problem
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm.hpp
index 79e3a44660..d1d136bcc8 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm.hpp
@@ -30,48 +30,6 @@ namespace ck {
 // 2. Occupied __shared__ won't release until whole shader end, a.k.a AB and C may not use same lds
 // buffer when we declare __shared__ inside blkgemmpipe
 
-#if 0
-template <typename GridwiseGemm,
-          bool HasMainKBlockLoop,
-          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
-          index_t MinimumOccupancy = 1,
-          TailNumber TailNum       = TailNumber::Even>
-__global__ void
-#if CK_USE_LAUNCH_BOUNDS
-__launch_bounds__(GridwiseGemm::MaxBlockSize, MinimumOccupancy)
-#endif
-    // __attribute__((amdgpu_waves_per_eu(1, 1)))
-    kernel_moe_mxgemm(typename GridwiseGemm::Argument karg)
-{
-#if defined(__gfx9__)
-    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
-    {
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-
-    auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
-
-    GridwiseGemm::template Run<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_sorted_token_ids,
-        karg.p_sorted_expert_ids,
-        karg.p_max_token_id,
-        karg.p_a_grid + splitk_batch_offset.a_k_split_offset,
-        karg.p_a_scale_grid + splitk_batch_offset.a_k_split_offset,
-        karg.p_b_grid + splitk_batch_offset.b_k_split_offset,
-        karg.p_b_scale_grid + splitk_batch_offset.b_k_split_offset,
-        karg.p_ds_grid,
-        karg.p_c_grid,
-        p_shared,
-        karg,
-        karg.a_element_op,
-        karg.b_element_op,
-        karg.c_element_op);
-    }
-#else
-    ignore = karg;
-#endif // end of if (defined(__gfx9__))
-}
-#endif
-
 template <typename GridwiseGemm,
           bool HasMainKBlockLoop,
           InMemoryDataOperationEnum CGlobalMemoryDataOperation,
@@ -1235,770 +1193,6 @@ struct GridwiseMoeGemmMX
                       is_same_v<BElementwiseOperation, tensor_operation::element_wise::PassThrough>,
                   "A/B ElementwiseOperation should be PassThrough as load_to_lds is used!");
 
-#if 0
-    template <bool HasMainKBlockLoop,
-              InMemoryDataOperationEnum CGlobalMemoryDataOperation,
-              TailNumber TailNum = TailNumber::Odd>
-    __device__ static void Run(const index_t* p_sorted_token_ids,
-                               const index_t* p_sorted_expert_ids,
-                               const index_t* p_max_token_id,
-                               const ADataType* p_a_grid,
-                               const AScaleDataType* p_a_scale_grid,
-                               const BDataType* p_b_grid,
-                               const BScaleDataType* p_b_scale_grid,
-                               DsGridPointer& p_ds_grid,
-                               CDataType* p_c_grid,
-                               void* p_shared,
-                               const Problem& problem,
-                               AElementwiseOperation a_element_op,
-                               BElementwiseOperation b_element_op,
-                               CElementwiseOperation c_element_op)
-    {
-        ignore                           = a_element_op;
-        ignore                           = b_element_op;
-        const auto a_grid_desc_ak0_m_ak1 = MakeAGridDescriptor_AK0_M_AK1(
-            IsInputGemm ? problem.NumTokens : problem.NumTokens * problem.TopK,
-            problem.MPadded,
-            problem.K,
-            problem.KPadded,
-            problem.StrideA,
-            problem.AK0);
-        const auto b_grid_desc_bk0_n_bk1 = MakeBGridDescriptor_BK0_N_BK1(
-            problem.K, problem.KPadded, problem.N, problem.NPadded, problem.StrideB, problem.BK0);
-        const auto c_grid_desc_m_n = MakeCGridDescriptor_M_N<CLayout>(
-            IsInputGemm ? problem.NumTokens * problem.TopK : problem.NumTokens,
-            problem.MPadded,
-            problem.N,
-            problem.NPadded,
-            problem.StrideC);
-
-        const auto a_scale_grid_desc_am_ak = make_naive_tensor_descriptor_packed(
-            make_tuple(problem.M / (MXdlPack * MPerXdl),
-                       math::integer_divide_ceil(problem.K, (ScaleBlockSize / APackedSize)) /
-                           (KXdlPack * 64 / MPerXdl),
-                       64 * KXdlPack * MXdlPack / scale_pack_size_a));
-
-        const auto b_scale_grid_desc_bn_ak = make_naive_tensor_descriptor_packed(
-            make_tuple(problem.N / (NXdlPack * NPerXdl),
-                       math::integer_divide_ceil(problem.K, (ScaleBlockSize / BPackedSize)) /
-                           (KXdlPack * 64 / NPerXdl),
-                       64 * KXdlPack * NXdlPack / scale_pack_size_b));
-
-        const auto c_grid_desc_mblock_mperblock_nblock_nperblock =
-            MakeCGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                c_grid_desc_m_n, problem.MBlock, problem.NBlock);
-
-        const index_t max_token_id    = __builtin_amdgcn_readfirstlane(p_max_token_id[0]);
-        const index_t expert_block_id = NSwizzle ? blockIdx.x / problem.NBlock : blockIdx.y;
-        if(expert_block_id * MPerBlock >= max_token_id)
-            return;
-        const index_t expert_id =
-            __builtin_amdgcn_readfirstlane(p_sorted_expert_ids[expert_block_id]);
-
-        const auto block_mn = [&]() -> std::pair<int, int> {
-            if constexpr(NSwizzle)
-            {
-                const index_t ecnt_prefix  = p_max_token_id[1 + expert_id];
-                const index_t prefix_block = ecnt_prefix * problem.NBlock;
-                const index_t ecnt         = p_max_token_id[2 + expert_id] - ecnt_prefix;
-                const index_t expert_swizzle =
-                    ecnt > 0 ? ecnt : 1; // p_max_token_id[expert_id + 1]; // 2
-                const index_t bid_new = blockIdx.x - prefix_block;
-                const index_t nid     = __builtin_amdgcn_readfirstlane(
-                    bid_new % 8 + bid_new / (8 * expert_swizzle) * 8);
-                const index_t mid =
-                    __builtin_amdgcn_readfirstlane(ecnt_prefix + bid_new / 8 % expert_swizzle);
-                return {nid, mid};
-            }
-            else
-            {
-                return {blockIdx.x, blockIdx.y};
-            }
-        }();
-
-        const index_t block_n_id = block_mn.first;
-        const index_t block_m_id = block_mn.second;
-        const index_t token0 =
-            __builtin_amdgcn_readfirstlane(p_sorted_token_ids[block_m_id * MPerBlock] & 0xffffff);
-
-        // constexpr auto M0 = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I1);
-        constexpr auto AMThreads  = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I1);
-        constexpr auto AK0Threads = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I0);
-        constexpr auto AK1Threads = ABlockTransferThreadClusterLengths_AK0_M_AK1{}.At(I2);
-        constexpr auto AKThreads  = AK0Threads * AK1Threads;
-        constexpr auto AMRepeats  = MPerBlock / AMThreads;
-        const index_t token_pos   = block_m_id * MPerBlock + threadIdx.x / AKThreads * AMRepeats;
-
-        if(token_pos >= max_token_id || token0 >= problem.NumTokens)
-            return;
-        StaticallyIndexedArray<IndexType, AMRepeats> gather_offsets;
-        static_for<0, AMRepeats, 1>{}([&](auto m0) {
-            const index_t fused_token = p_sorted_token_ids[token_pos + m0];
-            index_t token_offset      = fused_token & 0xffffff;
-            if constexpr(!IsInputGemm)
-            {
-                token_offset = token_offset * problem.TopK + (fused_token >> 24);
-            }
-            gather_offsets(m0) = static_cast<IndexType>(token_offset);
-        });
-
-        const long_index_t expert_stride =
-            __builtin_amdgcn_readfirstlane(static_cast<long_index_t>(problem.N) * problem.K * (IsInputGemm ? 2 : 1));
-        const long_index_t expert_scale_stride = __builtin_amdgcn_readfirstlane(
-            static_cast<long_index_t>(problem.N) * (IsInputGemm ? 2 : 1) *
-            math::integer_divide_ceil(problem.K, ScaleBlockSize / BPackedSize));
-
-        // N0, K0, Blocksize*KPack
-        const index_t n_block_data_idx_on_grid =
-            __builtin_amdgcn_readfirstlane(block_n_id * NPerBlock);
-
-        // Gride buffer creation
-        const auto a_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_a_grid, a_grid_desc_ak0_m_ak1.GetElementSpaceSize());
-        const auto b_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_b_grid + static_cast<long_index_t>(expert_id) * expert_stride, b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
-
-        // A, B scale buffer
-        const auto a_scale_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_a_scale_grid, a_scale_grid_desc_am_ak.GetElementSpaceSize());
-        const auto b_scale_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-            p_b_scale_grid + (static_cast<long_index_t>(expert_id) * expert_scale_stride) / sizeof(BScaleDataType),
-            b_scale_grid_desc_bn_ak.GetElementSpaceSize());
-
-        // lds max alignment
-        constexpr auto max_lds_align = math::lcm(AK1Number, BK1Number);
-
-        // A matrix in LDS memory, dst of blockwise copy
-        constexpr auto a_block_desc_ak0_m_ak1 = GetABlockDescriptor_AK0PerBlock_MPerBlock_AK1();
-
-        // B matrix in LDS memory, dst of blockwise copy
-        constexpr auto b_block_desc_bk0_n_bk1 = GetBBlockDescriptor_BK0PerBlock_NPerBlock_BK1();
-
-        // A matrix blockwise direct to LDS copy
-        auto a_blockwise_copy = ThreadGroupTensorSliceTransfer_Gather_DirectLoad<
-            ThisThreadBlock,
-            Sequence<AK0Number, MPerBlock, AK1Number>,
-            ABlockTransferThreadClusterLengths_AK0_M_AK1,
-            ABlockTransferThreadClusterArrangeOrder,
-            ADataType,
-            ADataType,
-            decltype(a_grid_desc_ak0_m_ak1),
-            decltype(a_block_desc_ak0_m_ak1),
-            ABlockTransferSrcAccessOrder,
-            ABlockTransferSrcVectorDim,
-            2,
-            ABlockTransferSrcScalarPerVector,
-            IndexType,
-            1>(a_grid_desc_ak0_m_ak1,
-               make_multi_index(0, 0, 0),
-               a_block_desc_ak0_m_ak1,
-               make_multi_index(0, 0, 0),
-               gather_offsets);
-
-        // B matrix blockwise copy
-        auto b_blockwise_copy =
-            ThreadGroupTensorSliceTransfer_DirectLoad<ThisThreadBlock,
-                                                      Sequence<BK0Number, NPerBlock, BK1Number>,
-                                                      BBlockTransferThreadClusterLengths_BK0_N_BK1,
-                                                      BBlockTransferThreadClusterArrangeOrder,
-                                                      BDataType,
-                                                      BDataType,
-                                                      decltype(b_grid_desc_bk0_n_bk1),
-                                                      decltype(b_block_desc_bk0_n_bk1),
-                                                      BBlockTransferSrcAccessOrder,
-                                                      BBlockTransferSrcVectorDim,
-                                                      2,
-                                                      BBlockTransferSrcScalarPerVector>(
-                b_grid_desc_bk0_n_bk1,
-                make_multi_index(0, n_block_data_idx_on_grid, 0),
-                b_block_desc_bk0_n_bk1,
-                make_multi_index(0, 0, 0));
-
-        // LDS allocation for A and B: be careful of alignment
-        constexpr auto a_block_space_size_aligned = math::integer_least_multiple(
-            a_block_desc_ak0_m_ak1.GetElementSpaceSize(), max_lds_align);
-
-        // Cast after lds
-        auto a_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<ADataType*>(p_shared), a_block_desc_ak0_m_ak1.GetElementSpaceSize());
-
-        auto b_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            reinterpret_cast<BDataType*>(static_cast<char*>(p_shared) +
-                                         a_block_space_size_aligned * sizeof(ADataType)),
-            b_block_desc_bk0_n_bk1.GetElementSpaceSize());
-
-        constexpr auto a_block_slice_copy_step = make_multi_index(KPerBlock / AK1Number, 0, 0);
-        constexpr auto b_block_slice_copy_step = make_multi_index(KPerBlock / BK1Number, 0, 0);
-
-        // Blockwise GEMM pipeline
-        static_assert(std::is_default_constructible_v<BlockwiseGemmPipe>);
-        auto blockwise_gemm_pipeline = BlockwiseGemmPipe{};
-        auto c_thread_buf            = blockwise_gemm_pipeline.GetCThreadBuffer();
-        decltype(c_thread_buf) c_thread_buf_up;
-
-        StaticBufferTupleOfVector<AddressSpaceEnum::Vgpr,
-                                  float,
-                                  c_thread_buf.num_of_v_,
-                                  c_thread_buf.s_per_v,
-                                  true>
-            c_thread_buf_fp32;
-
-        const index_t num_k_block_main_loop = __builtin_amdgcn_readfirstlane(
-            (a_grid_desc_ak0_m_ak1.GetLength(I0) * a_grid_desc_ak0_m_ak1.GetLength(I2)) /
-            KPerBlock);
-
-        // a and b scale processing
-        const auto wave_idx = BlockwiseGemmPipe::GetWaveIdx();
-        const auto waveId_m = wave_idx[I0];
-        const auto waveId_n = wave_idx[I1];
-
-        auto thread_offset_shuffled =
-            get_thread_local_1d_id() % BlockwiseGemmPipe::WaveSize * KXdlPack * MXdlPack;
-
-        auto a_thread_offset_m = waveId_m;
-
-        auto a_scale_thread_copy = ThreadwiseTensorSliceTransfer_v2<
-            AScaleDataType,
-            AScaleDataType,
-            decltype(a_scale_grid_desc_am_ak),
-            decltype(BlockwiseGemmPipe::a_scale_thread_desc),
-            Sequence<1, 1, KXdlPack * MXdlPack / scale_pack_size_a>, // SliceLengths
-            Sequence<0, 1, 2>,                                       // DimAccessOrder
-            2,                                                       // SrcVectorDim
-            KXdlPack * MXdlPack / scale_pack_size_a,                 // SrcScalarPerVector
-            1,                                                       // SrcScalarStrideInVector
-            true>(a_scale_grid_desc_am_ak,
-                  make_multi_index(block_m_id * MPerBlock / MPerXdl / MXdlPack + a_thread_offset_m,
-                                   0,
-                                   thread_offset_shuffled / scale_pack_size_a));
-
-        // B scale load
-        auto b_thread_offset_n = waveId_n;
-
-        auto b_scale_thread_copy = ThreadwiseTensorSliceTransfer_v2<
-            BScaleDataType,
-            BScaleDataType,
-            decltype(b_scale_grid_desc_bn_ak),
-            decltype(BlockwiseGemmPipe::b_scale_thread_desc),
-            Sequence<1, 1, KXdlPack * NXdlPack / scale_pack_size_b>, // SliceLengths
-            Sequence<0, 1, 2>,                                       // DimAccessOrder
-            2,                                                       // SrcVectorDim
-            KXdlPack * NXdlPack / scale_pack_size_b,                 // SrcScalarPerVector
-            1,                                                       // SrcScalarStrideInVector
-            true>(b_scale_grid_desc_bn_ak,
-                  make_multi_index(block_n_id * NPerBlock / NPerXdl / NXdlPack + b_thread_offset_n,
-                                   0,
-                                   thread_offset_shuffled / scale_pack_size_b));
-
-        if constexpr(IsInputGemm)
-        {
-            constexpr auto b_block_space_size_aligned = math::integer_least_multiple(
-                b_block_desc_bk0_n_bk1.GetElementSpaceSize(), max_lds_align);
-            auto b_block_buf_up = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-                reinterpret_cast<BDataType*>(static_cast<char*>(p_shared) +
-                                             a_block_space_size_aligned * sizeof(ADataType) +
-                                             b_block_space_size_aligned * sizeof(BDataType)),
-                b_block_desc_bk0_n_bk1.GetElementSpaceSize());
-
-            const BDataType* p_b_grid_up = p_b_grid + expert_stride / 2;
-            const auto b_grid_buf_up     = make_dynamic_buffer<AddressSpaceEnum::Global>(
-                p_b_grid_up + static_cast<long_index_t>(expert_id) * expert_stride,
-                b_grid_desc_bk0_n_bk1.GetElementSpaceSize());
-
-            auto b_blockwise_copy_up = ThreadGroupTensorSliceTransfer_DirectLoad<
-                ThisThreadBlock,
-                Sequence<BK0Number, NPerBlock, BK1Number>,
-                BBlockTransferThreadClusterLengths_BK0_N_BK1,
-                BBlockTransferThreadClusterArrangeOrder,
-                BDataType,
-                BDataType,
-                decltype(b_grid_desc_bk0_n_bk1),
-                decltype(b_block_desc_bk0_n_bk1),
-                BBlockTransferSrcAccessOrder,
-                BBlockTransferSrcVectorDim,
-                2,
-                BBlockTransferSrcScalarPerVector>(b_grid_desc_bk0_n_bk1,
-                                                  make_multi_index(0, n_block_data_idx_on_grid, 0),
-                                                  b_block_desc_bk0_n_bk1,
-                                                  make_multi_index(0, 0, 0));
-
-            const BScaleDataType* p_b_scale_grid_up =
-                p_b_scale_grid + expert_scale_stride / 2 / sizeof(BScaleDataType);
-            const auto b_scale_grid_buf_up = make_dynamic_buffer<AddressSpaceEnum::Global>(
-                p_b_scale_grid_up + static_cast<long_index_t>(expert_id) * expert_scale_stride / sizeof(BScaleDataType),
-                b_scale_grid_desc_bn_ak.GetElementSpaceSize());
-
-            auto b_scale_thread_copy_up = ThreadwiseTensorSliceTransfer_v2<
-                BScaleDataType,
-                BScaleDataType,
-                decltype(b_scale_grid_desc_bn_ak),
-                decltype(BlockwiseGemmPipe::b_scale_thread_desc),
-                Sequence<1, 1, KXdlPack * NXdlPack / scale_pack_size_b>, // SliceLengths
-                Sequence<0, 1, 2>,                                       // DimAccessOrder
-                2,                                                       // SrcVectorDim
-                KXdlPack * MXdlPack / scale_pack_size_b,                 // SrcScalarPerVector
-                1,                                                       // SrcScalarStrideInVector
-                true>(
-                b_scale_grid_desc_bn_ak,
-                make_multi_index(block_n_id * NPerBlock / NPerXdl / NXdlPack + b_thread_offset_n,
-                                 0,
-                                 thread_offset_shuffled / scale_pack_size_b));
-
-            blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(
-                // A
-                a_grid_desc_ak0_m_ak1,
-                a_block_desc_ak0_m_ak1,
-                a_blockwise_copy,
-                a_grid_buf,
-                a_block_buf,
-                a_block_slice_copy_step,
-                // Gate and Up
-                b_grid_desc_bk0_n_bk1,
-                b_block_desc_bk0_n_bk1,
-                b_blockwise_copy,
-                b_blockwise_copy_up,
-                b_grid_buf,
-                b_grid_buf_up,
-                b_block_buf,
-                b_block_buf_up,
-                b_block_slice_copy_step,
-                // C
-                c_thread_buf,
-                c_thread_buf_up,
-                // A scale
-                a_scale_grid_desc_am_ak,
-                a_scale_thread_copy,
-                a_scale_grid_buf,
-                // Gate and Up scale
-                b_scale_grid_desc_bn_ak,
-                b_scale_thread_copy,
-                b_scale_thread_copy_up,
-                b_scale_grid_buf,
-                b_scale_grid_buf_up,
-                num_k_block_main_loop);
-        }
-        else
-        {
-            blockwise_gemm_pipeline.template Run<HasMainKBlockLoop, TailNum>(
-                a_grid_desc_ak0_m_ak1, // A
-                a_block_desc_ak0_m_ak1,
-                a_blockwise_copy,
-                a_grid_buf,
-                a_block_buf,
-                a_block_slice_copy_step,
-                b_grid_desc_bk0_n_bk1, // B
-                b_block_desc_bk0_n_bk1,
-                b_blockwise_copy,
-                b_grid_buf,
-                b_block_buf,
-                b_block_slice_copy_step,
-                c_thread_buf,            // C
-                a_scale_grid_desc_am_ak, // A scale
-                a_scale_thread_copy,
-                a_scale_grid_buf,
-                b_scale_grid_desc_bn_ak, // B scale
-                b_scale_thread_copy,
-                b_scale_grid_buf,
-                num_k_block_main_loop);
-        }
-
-        // shuffle C and write out
-        {
-            static_assert(MXdlPerWave % CShuffleMXdlPerWavePerShuffle == 0 &&
-                              NXdlPerWave % CShuffleNXdlPerWavePerShuffle == 0,
-                          "wrong!");
-            static_assert(CShuffleMXdlPerWavePerShuffle % MXdlPack == 0 &&
-                              CShuffleNXdlPerWavePerShuffle % NXdlPack == 0,
-                          "wrong!");
-
-            constexpr index_t MWave = MPerBlock / (MXdlPerWave * MPerXdl);
-            constexpr index_t NWave = NPerBlock / (NXdlPerWave * NPerXdl);
-
-            // TODO: hacky, fix it!
-            constexpr auto c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2 =
-                blockwise_gemm_pipeline.GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_M3_M4_M5_N3();
-
-            // TODO: hacky, fix it!
-            // c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp is only used to get lengths
-            constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp =
-                blockwise_gemm_pipeline.GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_M3_M4_M5_N3();
-
-            constexpr auto M0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I0);
-            constexpr auto N0 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I1);
-            constexpr auto M1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I2);
-            constexpr auto N1 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I3);
-            constexpr auto M2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I4);
-            constexpr auto N2 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I5);
-            constexpr auto M3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I6);
-            constexpr auto M4 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I7);
-            constexpr auto M5 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I8);
-            constexpr auto N3 = c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2_tmp.GetLength(I9);
-
-            // mul scales
-            static_assert(M0 * M1 * M2 * M3 * M4 * M5 == MPerBlock);
-            static_assert(M5 == 4);
-            const index_t m1 = get_warp_local_1d_id() / NWave; // Mwave id
-            const index_t m4 = threadIdx.x % get_warp_size() / MPerXdl;
-
-            vector_type<float, 4> topk_weights; // for gemm2 only
-            static_for<0, NXdlPerWave / NXdlPack, 1>{}([&](auto n0) {
-                static_for<0, NXdlPack, 1>{}([&](auto inxdl) {                // NXdlPack
-                    static_for<0, MXdlPerWave / MXdlPack, 1>{}([&](auto m0) { // MXDLPerWave
-                        static_for<0, MXdlPack, 1>{}([&](auto imxdl) {        // MXdlPack
-                            static_for<0, M3, 1>{}([&](auto m3) { // m_inst_num_groups_per_blk
-                                const index_t m_pos = block_m_id * MPerBlock +
-                                                      m0 * M2 * M1 * M3 * M4 * M5 +
-                                                      m1 * M2 * M3 * M4 * M5 +
-                                                      imxdl * M3 * M4 * M5 + m3 * M4 * M5 + m4 * M5;
-                                if constexpr(MulRoutedWeight)
-                                {
-                                    topk_weights =
-                                        *c_style_pointer_cast<const vector_type<float, M5>*>(
-                                            p_ds_grid[I2] + m_pos);
-                                }
-                                static_for<0, M5, 1>{}([&](auto m5) { // m_inst_group_size
-                                    constexpr index_t c_offset =
-                                        blockwise_gemm_pipeline.GetCThreadDesc().CalculateOffset(
-                                            make_tuple(m0, n0, imxdl, inxdl, m3 * M5 + m5));
-                                    constexpr auto cidx = Number<c_offset>{};
-
-                                    if constexpr(IsInputGemm) // gu fusion
-                                    {
-                                        if constexpr(ActivationOperation ==
-                                                     Activation::silu_and_mul)
-                                        {
-                                            float gate = c_thread_buf[cidx];
-                                            float up   = c_thread_buf_up[cidx];
-                                            if constexpr(MulRoutedWeight)
-                                            {
-                                                gate = gate * topk_weights.AsType<float>()[m5];
-                                                up   = up * topk_weights.AsType<float>()[m5];
-                                            }
-                                            tensor_operation::element_wise::Silu{}(gate, gate);
-                                            c_thread_buf_fp32(cidx) = gate * up;
-                                        }
-                                        else if(ActivationOperation == Activation::gelu_and_mul)
-                                        {
-                                            float gate = c_thread_buf[cidx];
-                                            float up   = c_thread_buf_up[cidx];
-                                            if constexpr(MulRoutedWeight)
-                                            {
-                                                gate = gate * topk_weights.AsType<float>()[m5];
-                                                up   = up * topk_weights.AsType<float>()[m5];
-                                            }
-                                            tensor_operation::element_wise::Gelu{}(gate, gate);
-                                            c_thread_buf_fp32(cidx) = gate * up;
-
-                                            /*float gate = c_thread_buf[cidx];
-                                            float up   = c_thread_buf_up[cidx];
-                                            if constexpr(MulRoutedWeight)
-                                            {
-                                                gate = gate * topk_weights.AsType<float>()[m5];
-                                                //up   = up * topk_weights.AsType<float>()[m5];
-                                            }
-                                            tensor_operation::element_wise::Gelu{}(gate, gate);
-                                            c_thread_buf_fp32(cidx) = up;*/
-                                        }
-                                    }
-                                    else
-                                    {
-                                        c_thread_buf_fp32(cidx) = c_thread_buf[cidx];
-                                        if constexpr(MulRoutedWeight)
-                                        {
-                                            c_thread_buf_fp32(cidx) =
-                                                topk_weights.AsType<float>()[m5] *
-                                                c_thread_buf_fp32[cidx];
-                                        }
-                                    }
-                                });
-                            });
-                        });
-                    });
-                });
-            });
-
-            constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
-                GetCShuffleBlockDescriptor_MBlock_MPerBlock_NBlock_NPerBlock();
-
-            auto c_shuffle_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-                static_cast<CShuffleDataType*>(p_shared),
-                c_shuffle_block_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
-
-            constexpr auto c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2 = transform_tensor_descriptor(
-                c_shuffle_block_desc_mblock_mperblock_nblock_nperblock,
-                make_tuple(
-                    make_freeze_transform(I0),
-                    make_unmerge_transform(make_tuple(
-                        Number<CShuffleMXdlPerWavePerShuffle / MXdlPack>{}, // M0 (MXdlPerWave)
-                                                                            // per shuffle
-                        M1,                                                 // M1 = MWave
-                        M2,                                                 // M2 = MXdlPack
-                        M3, // M3 * M4 * M5 = MPerXdl
-                        M4,
-                        M5)),
-                    make_freeze_transform(I0),
-                    make_unmerge_transform(make_tuple(
-                        Number<CShuffleNXdlPerWavePerShuffle / NXdlPack>{}, // N0 (NXdlPerWave)
-                                                                            // per shuffle
-                        N1,                                                 // N1 = NWave
-                        N2,                                                 // N2 = NXdlPack
-                        N3))),                                              // N3 = NPerXdl
-                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}),
-                make_tuple(Sequence<>{},
-                           Sequence<0, 2, 4, 6, 7, 8>{},
-                           Sequence<>{},
-                           Sequence<1, 3, 5, 9>{}));
-
-            // calculate origin of thread output tensor on global memory
-            //     blockwise GEMM c matrix starting index
-            const auto c_thread_mtx_on_block =
-                blockwise_gemm_pipeline.CalculateCThreadOriginDataIndex(I0, I0, I0, I0);
-
-            const index_t m_thread_data_on_block = c_thread_mtx_on_block[I0];
-            const index_t n_thread_data_on_block = c_thread_mtx_on_block[I1];
-
-            const auto m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor =
-                make_single_stage_tensor_adaptor(
-                    make_tuple(make_merge_transform(make_tuple(M0, M1, M2, M3, M4, M5))),
-                    make_tuple(Sequence<0, 1, 2, 3, 4, 5>{}),
-                    make_tuple(Sequence<0>{}));
-
-            const auto m_thread_data_on_block_idx =
-                m_thread_data_on_block_to_m0_m1_m2_m3_m4_adaptor.CalculateBottomIndex(
-                    make_multi_index(m_thread_data_on_block));
-
-            const auto n_thread_data_on_block_to_n0_n1_n2_adaptor =
-                make_single_stage_tensor_adaptor(
-                    make_tuple(make_merge_transform(make_tuple(N0, N1, N2, N3))),
-                    make_tuple(Sequence<0, 1, 2, 3>{}),
-                    make_tuple(Sequence<0>{}));
-
-            const auto n_thread_data_on_block_idx =
-                n_thread_data_on_block_to_n0_n1_n2_adaptor.CalculateBottomIndex(
-                    make_multi_index(n_thread_data_on_block));
-
-            // shuffle: threadwise copy C from VGPR to LDS
-            auto c_thread_copy_vgpr_to_lds = ThreadwiseTensorSliceTransfer_v1r3<
-                AccDataType,
-                CShuffleDataType,
-                decltype(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2),
-                decltype(c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2),
-                ck::tensor_operation::element_wise::PassThrough,
-                Sequence<CShuffleMXdlPerWavePerShuffle / MXdlPack,
-                         CShuffleNXdlPerWavePerShuffle / NXdlPack,
-                         I1,
-                         I1,
-                         M2,
-                         N2,
-                         M3,
-                         I1,
-                         M5,
-                         I1>,
-                Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9>,
-                9,
-                1,
-                InMemoryDataOperationEnum::Set,
-                1,
-                true>{c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
-                      make_multi_index(0,
-                                       0,
-                                       m_thread_data_on_block_idx[I1],
-                                       n_thread_data_on_block_idx[I1],
-                                       m_thread_data_on_block_idx[I2],
-                                       n_thread_data_on_block_idx[I2],
-                                       m_thread_data_on_block_idx[I3],
-                                       m_thread_data_on_block_idx[I4],
-                                       m_thread_data_on_block_idx[I5],
-                                       n_thread_data_on_block_idx[I3]),
-                      ck::tensor_operation::element_wise::PassThrough{}};
-
-            using EDataType = CDataType;
-
-            const auto ds_grid_desc_m_n = MakeDsGridDescriptor_M_N(
-                problem.M, problem.MPadded, problem.N, problem.NPadded, problem.StrideDs);
-
-            const auto ds_grid_desc_mblock_mperblock_nblock_nperblock =
-                MakeDsGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock(
-                    ds_grid_desc_m_n, problem.MBlock, problem.NBlock);
-
-            const auto ds_grid_buf = generate_tuple(
-                [&](auto i) {
-                    return make_dynamic_buffer<AddressSpaceEnum::Global>(
-                        p_ds_grid[i], ds_grid_desc_m_n[i].GetElementSpaceSize());
-                },
-                Number<NumDTensor>{});
-
-            // tuple of reference to C/Ds tensor descriptors
-            const auto c_ds_desc_refs = concat_tuple_of_reference(
-                tie(c_shuffle_block_desc_mblock_mperblock_nblock_nperblock),
-                generate_tie([&](auto i) -> const auto& // return type should be reference
-                             { return ds_grid_desc_mblock_mperblock_nblock_nperblock[i]; },
-                             Number<NumDTensor>{}));
-
-            // tuple of reference to C/Ds tensor descriptors
-            const auto c_ds_buf_refs = concat_tuple_of_reference(
-                tie(c_shuffle_block_buf),
-                generate_tie([&](auto i) -> const auto& // return type should be reference
-                             { return ds_grid_buf[i]; },
-                             Number<NumDTensor>{}));
-
-            // tuple of starting index of C/Ds blockwise copy
-            const auto idx_c_ds_block_begin =
-                container_concat(make_tuple(make_multi_index(0, 0, 0, 0)),
-                                 generate_tuple(
-                                     [&](auto) {
-                                         return make_multi_index(block_m_id, 0, block_n_id, 0);
-                                         // return make_multi_index(block_work_idx[I0], 0,
-                                         // block_work_idx[I1], 0);
-                                     },
-                                     Number<NumDTensor>{}));
-
-            const auto e_grid_desc_mblock_mperblock_nblock_nperblock =
-                c_grid_desc_mblock_mperblock_nblock_nperblock;
-
-            using CDEBlockTransferCluster =
-                CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock;
-            const auto EGlobalMemoryDataOperation = CGlobalMemoryDataOperation;
-            constexpr index_t scatter_weight_idx  = 3; // hack fix felix
-            auto cde_block_copy_lds_and_global    = ThreadGroupTensorSliceTransfer_v7r3_scatter<
-                   ThisThreadBlock,
-                   decltype(container_concat(make_tuple(CShuffleDataType{}), DsDataType{})),
-                   Tuple<EDataType>,
-                   decltype(c_ds_desc_refs),
-                   decltype(tie(e_grid_desc_mblock_mperblock_nblock_nperblock)),
-                   CElementwiseOperation,
-                   Sequence<static_cast<index_t>(EGlobalMemoryDataOperation)>, // FIXME: make
-                                                                               // Sequence support
-                                                                               // arbitray type
-                   Sequence<1,
-                            CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
-                            1,
-                            CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>, // BlockSliceLengths,
-                   CDEBlockTransferCluster,
-                   Sequence<0, 1, 2, 3>, // typename ThreadClusterArrangeOrder,
-                   Sequence<0, 1, 2, 3>, // typename SrcDimAccessOrder,
-                   Sequence<0, 1, 2, 3>, // typename DstDimAccessOrder,
-                   3,                    // index_t SrcVectorDim,
-                   3,                    // index_t DstVectorDim,
-                   CDEShuffleBlockTransferScalarPerVectors,
-                   CShuffleBlockTransferScalarPerVector_NPerBlock,
-                   sequence_merge_t<
-                       Sequence<true>,
-                       uniform_sequence_gen_t<NumDTensor,
-                                              false>>, // ThreadTransferSrcResetCoordinateAfterRunFlags
-                   Sequence<false>, // ThreadTransferDstResetCoordinateAfterRunFlags
-                   IndexType,
-                   1,                 // ScatterDim
-                   true,              // OutputScatter: false, only use scatter weights
-                   scatter_weight_idx // ScatterWeightIdx: ascale
-                   >{c_ds_desc_refs,
-                     idx_c_ds_block_begin,
-                     tie(e_grid_desc_mblock_mperblock_nblock_nperblock),
-                     make_tuple(make_multi_index(0, 0, block_n_id, 0)),
-                     c_element_op};
-
-            auto c_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
-                p_c_grid, c_grid_desc_mblock_mperblock_nblock_nperblock.GetElementSpaceSize());
-
-            constexpr auto sfc_c_vgpr =
-                SpaceFillingCurve<Sequence<MXdlPerWave / MXdlPack,
-                                           NXdlPerWave / NXdlPack,
-                                           1,
-                                           1,
-                                           MXdlPack,
-                                           NXdlPack,
-                                           M2,
-                                           1,
-                                           M4,
-                                           1>,
-                                  Sequence<0, 1, 2, 3, 4, 5, 6, 7, 8, 9>,
-                                  Sequence<CShuffleMXdlPerWavePerShuffle / MXdlPack,
-                                           CShuffleNXdlPerWavePerShuffle / NXdlPack,
-                                           1,
-                                           1,
-                                           MXdlPack,
-                                           NXdlPack,
-                                           M2,
-                                           1,
-                                           M4,
-                                           1>>{};
-
-            constexpr index_t num_access = sfc_c_vgpr.GetNumOfAccess();
-
-            // space filling curve for shuffled blockwise C/D/E
-            constexpr auto sfc_cde_block =
-                SpaceFillingCurve<Sequence<1, MPerBlock, 1, NPerBlock>,
-                                  Sequence<0, 2, 1, 3>,
-                                  Sequence<1,
-                                           CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl,
-                                           1,
-                                           CShuffleNXdlPerWavePerShuffle * NWave * NPerXdl>>{};
-
-            static_assert(num_access == sfc_cde_block.GetNumOfAccess(), "wrong!");
-            constexpr auto EMThreads =
-                CDEBlockTransferCluster{}.At(I0) * CDEBlockTransferCluster{}.At(I1);
-            constexpr auto EMRepeats = CShuffleMXdlPerWavePerShuffle * MWave * MPerXdl / EMThreads;
-            constexpr auto ENThreads =
-                CDEBlockTransferCluster{}.At(I2) * CDEBlockTransferCluster{}.At(I3);
-            static_for<0, num_access, 1>{}([&](auto access_id) {
-                // make sure it's safe to write to LDS
-                StaticallyIndexedArray<IndexType, EMRepeats> scatter_offsets;
-
-                auto dstidx = sfc_cde_block.GetIndex(access_id);
-                const index_t c_token_pos =
-                    block_m_id * MPerBlock + threadIdx.x / ENThreads * EMRepeats + dstidx(I1);
-                static_for<0, EMRepeats, 1>{}([&](auto m0) {
-                    const index_t fused_token = p_sorted_token_ids[c_token_pos + m0];
-                    IndexType token_offset    = fused_token & 0xffffff;
-                    if constexpr(IsInputGemm)
-                    {
-                        token_offset = token_offset * problem.TopK + (fused_token >> 24);
-                    }
-                    scatter_offsets(m0) = static_cast<IndexType>(token_offset) * problem.N;
-                });
-
-                block_sync_lds();
-
-                // each thread write its data from VGPR to LDS
-                c_thread_copy_vgpr_to_lds.Run(c_thread_desc_m0_n0_m1_n1_m2_m3_m4_n2,
-                                              sfc_c_vgpr.GetIndexTupleOfNumber(access_id),
-                                              c_thread_buf_fp32,
-                                              c_block_desc_m0_n0_m1_n1_m2_m3_m4_n2,
-                                              c_shuffle_block_buf);
-
-                // make sure it's safe to read from LDS
-                block_sync_lds();
-
-                // each block copy its data from LDS to global
-                cde_block_copy_lds_and_global.Run(
-                    c_ds_desc_refs,
-                    c_ds_buf_refs,
-                    tie(e_grid_desc_mblock_mperblock_nblock_nperblock),
-                    tie(c_grid_buf),
-                    scatter_offsets);
-
-                if constexpr(access_id < num_access - 1)
-                {
-                    constexpr auto cde_lds_and_global_step =
-                        sfc_cde_block.GetForwardStep(access_id);
-
-                    // move on Ds
-                    static_for<0, NumDTensor, 1>{}([&](auto i) {
-                        cde_block_copy_lds_and_global.MoveSrcSliceWindow(
-                            c_ds_desc_refs, i + I1, cde_lds_and_global_step);
-                    });
-
-                    // move on E
-                    cde_block_copy_lds_and_global.MoveDstSliceWindow(
-                        tie(e_grid_desc_mblock_mperblock_nblock_nperblock),
-                        I0,
-                        cde_lds_and_global_step);
-                }
-            });
-        }
-    }
-#endif
-
     template <bool HasMainKBlockLoop,
               InMemoryDataOperationEnum CGlobalMemoryDataOperation,
               TailNumber TailNum = TailNumber::Odd>
diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm_bns.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm_bns.hpp
index 8559b78fe0..d428cb5e99 100644
--- a/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm_bns.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_moe_mx_gemm_bns.hpp
@@ -70,50 +70,6 @@ __launch_bounds__(GridwiseGemm::MaxBlockSize, MinimumOccupancy)
 #endif // end of if (defined(__gfx9__))
 }
 
-#if 0
-template <typename GridwiseGemm,
-          bool HasMainKBlockLoop,
-          InMemoryDataOperationEnum CGlobalMemoryDataOperation,
-          index_t MinimumOccupancy = 1,
-          TailNumber TailNum       = TailNumber::Even>
-__global__ void
-#if CK_USE_LAUNCH_BOUNDS
-__launch_bounds__(GridwiseGemm::MaxBlockSize, MinimumOccupancy)
-#endif
-    // __attribute__((amdgpu_waves_per_eu(1, 1)))
-    kernel_moe_mxgemm_2lds(typename GridwiseGemm::Argument karg)
-{
-#if defined(__gfx9__)
-    if constexpr(GridwiseGemm::template IsValidCompilationParameter<CGlobalMemoryDataOperation>())
-    {
-    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-    __shared__ char p_shared1[GridwiseGemm::GetSharedMemoryNumberOfByte()];
-
-    // auto splitk_batch_offset = typename GridwiseGemm::SplitKBatchOffset(karg, blockIdx.z);
-
-    GridwiseGemm::template Run_2Lds<HasMainKBlockLoop, CGlobalMemoryDataOperation, TailNum>(
-        karg.p_sorted_token_ids,
-        karg.p_sorted_expert_ids,
-        karg.p_max_token_id,
-        karg.p_a_grid,
-        karg.p_a_scale_grid,
-        karg.p_b_grid,
-        karg.p_b_scale_grid,
-        karg.p_ds_grid,
-        karg.p_c_grid,
-        p_shared,
-        p_shared1,
-        karg,
-        karg.a_element_op,
-        karg.b_element_op,
-        karg.c_element_op);
-    }
-#else
-    ignore = karg;
-#endif // end of if (defined(__gfx9__))
-}
-#endif
-
 template <typename ALayout,
           typename BLayout,
           typename DsLayout,
diff --git a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
index 1b60e6cb53..09cf0ec87b 100644
--- a/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/warp/wmma_gemm.hpp
@@ -303,14 +303,10 @@ struct wmma_type<WmmaInstr::wmma_f32_16x16x16_f16_gfx12,
 {
     // Absolute fixing property
     // * Data Pixel
-    static constexpr index_t m_per_wmma = 16;
-    static constexpr index_t n_per_wmma = 16;
-    static constexpr index_t k_per_wmma = 16;
-    static constexpr index_t k_per_blk  = 8;
-    // static constexpr index_t src_a_data_size = 2;
-    // static constexpr index_t src_b_data_size = 2;
-    // static constexpr index_t acc_data_size   = 4;
-    // * Thread mapping inside wave, num_thread_per_subgroups always alone N direction
+    static constexpr index_t m_per_wmma               = 16;
+    static constexpr index_t n_per_wmma               = 16;
+    static constexpr index_t k_per_wmma               = 16;
+    static constexpr index_t k_per_blk                = 8;
     static constexpr index_t acc_data_size            = 4;
     static constexpr index_t acc_pack_number          = 1;
     static constexpr index_t num_thread_per_subgroups = n_per_wmma;
diff --git a/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm_arraybase.hpp b/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm_arraybase.hpp
index 9fb81012ea..104e51de0c 100644
--- a/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm_arraybase.hpp
+++ b/include/ck/tensor_operation/operator_transform/transform_contraction_to_gemm_arraybase.hpp
@@ -20,11 +20,6 @@ __host__ __device__ static auto
 MakeGridDescriptorPair(const std::array<index_t, NumDimG + NumDimM + NumDimN>& gs_ms_ns_lengths_vec,
                        const std::array<index_t, NumDimG + NumDimM + NumDimN>& gs_ms_ns_strides_vec)
 {
-    // if(!(gs_ms_ns_lengths_vec.size() == NumDimG + NumDimM + NumDimN &&
-    //      gs_ms_ns_strides_vec.size() == NumDimG + NumDimM + NumDimN))
-    // {
-    //     throw std::runtime_error("wrong! dimension must match input lengths");
-    // }
 
     const auto to_tuple = [&](auto& vec, auto start, auto end) {
         return generate_tuple([&](auto i) { return vec[start + i]; }, Number<end - start>{});
diff --git a/include/ck/utility/container_element_picker.hpp b/include/ck/utility/container_element_picker.hpp
index 9de2466e71..cec6c85298 100644
--- a/include/ck/utility/container_element_picker.hpp
+++ b/include/ck/utility/container_element_picker.hpp
@@ -15,9 +15,6 @@ template <typename Arr, typename Picks>
 struct ContainerElementPicker
 {
     using type = ContainerElementPicker;
-#if 0
-    using data_type = typename Arr::data_type;
-#endif
 
     __host__ __device__ constexpr ContainerElementPicker() = delete;
 
@@ -81,9 +78,6 @@ template <typename Arr, typename Picks>
 struct ConstantContainerElementPicker
 {
     using type = ConstantContainerElementPicker;
-#if 0
-    using data_type = typename Arr::data_type;
-#endif
 
     __host__ __device__ constexpr ConstantContainerElementPicker() = delete;
 
diff --git a/include/ck/utility/dynamic_buffer.hpp b/include/ck/utility/dynamic_buffer.hpp
index 00fab270e8..ce4c92425e 100644
--- a/include/ck/utility/dynamic_buffer.hpp
+++ b/include/ck/utility/dynamic_buffer.hpp
@@ -361,14 +361,8 @@ struct DynamicBuffer
         {
             if(is_valid_element)
             {
-#if 0
-                X tmp = x;
-
-                __builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
-#else
                 // if(i >= 2169041600)
                 *c_style_pointer_cast<X*>(&p_data_[i]) = x;
-#endif
             }
         }
     }
diff --git a/include/ck/utility/transpose_vectors.hpp b/include/ck/utility/transpose_vectors.hpp
index de20674ef2..11b503da69 100644
--- a/include/ck/utility/transpose_vectors.hpp
+++ b/include/ck/utility/transpose_vectors.hpp
@@ -18,22 +18,6 @@ struct transpose_vectors;
 // transpose fp16 2x2
 __device__ void transpose_fp16_2x2(const half2_t& x0, const half2_t& x1, half2_t& y0, half2_t& y1)
 {
-#if 0
-    static constexpr auto I0 = Number<0>{};
-    static constexpr auto I1 = Number<1>{};
-
-    const vector_type<half_t, 2> vx0{x0}, vx1{x1};
-    vector_type<half_t, 2> vy0, vy1;
-
-    vy0.template AsType<half_t>()(I0) = vx0.template AsType<half_t>()[I0];
-    vy0.template AsType<half_t>()(I1) = vx1.template AsType<half_t>()[I0];
-
-    vy1.template AsType<half_t>()(I0) = vx0.template AsType<half_t>()[I1];
-    vy1.template AsType<half_t>()(I1) = vx1.template AsType<half_t>()[I1];
-
-    y0 = vy0.template AsType<half2_t>()[I0];
-    y1 = vy1.template AsType<half2_t>()[I0];
-#else
     constexpr int32_t m0 = 0x05040100;
     constexpr int32_t m1 = 0x07060302;
 
@@ -43,7 +27,6 @@ __device__ void transpose_fp16_2x2(const half2_t& x0, const half2_t& x1, half2_t
     // index is reversed because of little endianness (least significant bits first)
     y0 = bit_cast<half2_t>(__builtin_amdgcn_perm(bit_cast<int32_t>(x1), bit_cast<int32_t>(x0), m0));
     y1 = bit_cast<half2_t>(__builtin_amdgcn_perm(bit_cast<int32_t>(x1), bit_cast<int32_t>(x0), m1));
-#endif
 }
 
 template <index_t NX, index_t NY>
diff --git a/include/ck/utility/workgroup_barrier.hpp b/include/ck/utility/workgroup_barrier.hpp
index 0e440799be..0be341da88 100644
--- a/include/ck/utility/workgroup_barrier.hpp
+++ b/include/ck/utility/workgroup_barrier.hpp
@@ -12,20 +12,6 @@ struct workgroup_barrier
 
     __device__ uint32_t ld(uint32_t offset)
     {
-#if 0
-        float d = llvm_amdgcn_raw_buffer_load_fp32(
-                        amdgcn_make_buffer_resource(base_ptr),
-                        0,
-                        offset,
-                        AMDGCN_BUFFER_GLC);
-        union cvt {
-            float f32;
-            uint32_t u32;
-        };
-        cvt x;
-        x.f32 = d;
-        return x.u32;
-#endif
         return __atomic_load_n(base_ptr + offset, __ATOMIC_RELAXED);
     }
 
diff --git a/include/ck_tile/core/arch/amd_buffer_addressing.hpp b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
index a32f26dadf..6a9c9e3faf 100644
--- a/include/ck_tile/core/arch/amd_buffer_addressing.hpp
+++ b/include/ck_tile/core/arch/amd_buffer_addressing.hpp
@@ -2166,27 +2166,11 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
         }
         else if constexpr(N == 8)
         {
-#if 0
-            thread_buffer<fp16_t, 8> tmp{src_thread_data};
-
-            llvm_amdgcn_raw_buffer_store_fp16x4(tmp.template get_as<fp16x4_t>()[number<0>{}],
-                                                dst_wave_buffer_resource,
-                                                dst_thread_addr_offset,
-                                                dst_wave_addr_offset,
-                                                static_cast<index_t>(coherence));
-
-            llvm_amdgcn_raw_buffer_store_fp16x4(tmp.template get_as<fp16x4_t>()[number<1>{}],
-                                                dst_wave_buffer_resource,
-                                                dst_thread_addr_offset,
-                                                dst_wave_addr_offset + 4 * sizeof(fp16_t),
-                                                static_cast<index_t>(coherence));
-#else
             llvm_amdgcn_raw_buffer_store_fp32x4(bit_cast<fp32x4_t>(src_thread_data),
                                                 dst_wave_buffer_resource,
                                                 dst_thread_addr_offset,
                                                 dst_wave_addr_offset,
                                                 static_cast<index_t>(coherence));
-#endif
         }
     }
     else if constexpr(std::is_same<T, bf16_t>::value) // bf16
diff --git a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp
index 7d57858f26..8056b76af7 100644
--- a/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp
+++ b/include/ck_tile/core/arch/amd_buffer_addressing_builtins.hpp
@@ -1992,27 +1992,11 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
         }
         else if constexpr(N == 8)
         {
-#if 0
-            thread_buffer<fp16_t, 8> tmp{src_thread_data};
-
-            llvm_amdgcn_raw_buffer_store_fp16x4(tmp.template get_as<fp16x4_t>()[number<0>{}],
-                                                dst_wave_buffer_resource,
-                                                dst_thread_addr_offset,
-                                                dst_wave_addr_offset,
-                                                static_cast<index_t>(coherence));
-
-            llvm_amdgcn_raw_buffer_store_fp16x4(tmp.template get_as<fp16x4_t>()[number<1>{}],
-                                                dst_wave_buffer_resource,
-                                                dst_thread_addr_offset,
-                                                dst_wave_addr_offset + 4 * sizeof(fp16_t),
-                                                static_cast<index_t>(coherence));
-#else
             llvm_amdgcn_raw_buffer_store_fp32x4(bit_cast<fp32x4_t>(src_thread_data),
                                                 dst_wave_buffer_resource,
                                                 dst_thread_addr_offset,
                                                 dst_wave_addr_offset,
                                                 static_cast<index_t>(coherence));
-#endif
         }
     }
     else if constexpr(std::is_same<T, bf16_t>::value) // bf16
diff --git a/include/ck_tile/core/container/array.hpp b/include/ck_tile/core/container/array.hpp
index 45adbded2c..d6ba1efcbe 100644
--- a/include/ck_tile/core/container/array.hpp
+++ b/include/ck_tile/core/container/array.hpp
@@ -84,19 +84,6 @@ struct array
             data[i] = static_cast<value_type>(c);
     }
 
-    // template <typename Y>
-    // CK_TILE_HOST_DEVICE constexpr array(const array& o)
-    // {
-    //     // static_assert(ArrayType::size() == size(), "wrong! size not the same");
-    //     __content = o.__content;
-    // }
-    // CK_TILE_HOST_DEVICE constexpr array& operator=(const array& o)
-    // {
-    //     // static_assert(ArrayType::size() == size(), "wrong! size not the same");
-    //     __content = o.__content;
-    //     return *this;
-    // }
-
     CK_TILE_HOST_DEVICE static constexpr auto size() { return N; }
     CK_TILE_HOST_DEVICE static constexpr bool is_static() { return is_static_v<value_type>; }
 
@@ -247,13 +234,6 @@ CK_TILE_HOST_DEVICE constexpr details::return_type<D, Ts...> make_array(Ts&&...
     return {std::forward<Ts>(ts)...};
 }
 
-// // make empty array
-// template <typename T>
-// CK_TILE_HOST_DEVICE constexpr auto make_array()
-// {
-//     return array<T, 0>{};
-// }
-
 // compatible with old ck's initializer, make an array and fill it withe the last element from
 // initializer_list
 template <typename T, index_t Size>
diff --git a/include/ck_tile/core/container/sequence.hpp b/include/ck_tile/core/container/sequence.hpp
index 73ce09b20e..4e94d6e902 100644
--- a/include/ck_tile/core/container/sequence.hpp
+++ b/include/ck_tile/core/container/sequence.hpp
@@ -480,32 +480,6 @@ struct sequence_split
     using right_type = decltype(Seq::extract(range1{}));
 };
 
-#if 0
-// reverse sequence
-template <typename Seq>
-struct sequence_reverse
-{
-    static constexpr index_t NSize = Seq{}.size();
-
-    using seq_split = sequence_split<Seq, NSize / 2>;
-    using type      = typename sequence_merge<
-        typename sequence_reverse<typename seq_split::right_type>::type,
-        typename sequence_reverse<typename seq_split::left_type>::type>::type;
-};
-
-template <index_t I>
-struct sequence_reverse<sequence<I>>
-{
-    using type = sequence<I>;
-};
-
-template <index_t I0, index_t I1>
-struct sequence_reverse<sequence<I0, I1>>
-{
-    using type = sequence<I1, I0>;
-};
-#endif
-
 namespace detail {
 template <typename Id, index_t... Ns>
 struct seq_reverse;
diff --git a/include/ck_tile/core/container/statically_indexed_array.hpp b/include/ck_tile/core/container/statically_indexed_array.hpp
index d35934ab04..111b8a8c58 100644
--- a/include/ck_tile/core/container/statically_indexed_array.hpp
+++ b/include/ck_tile/core/container/statically_indexed_array.hpp
@@ -24,18 +24,4 @@ using statically_indexed_array = array<T, N>;
 #endif
 
 // consider always use ck_tile::array for this purpose
-#if 0
-template <typename X, typename... Xs>
-CK_TILE_HOST_DEVICE constexpr auto make_statically_indexed_array(const X& x, const Xs&... xs)
-{
-    return statically_indexed_array<X, sizeof...(Xs) + 1>(x, static_cast<X>(xs)...);
-}
-
-// make empty statically_indexed_array
-template <typename X>
-CK_TILE_HOST_DEVICE constexpr auto make_statically_indexed_array()
-{
-    return statically_indexed_array<X, 0>();
-}
-#endif
 } // namespace ck_tile
diff --git a/include/ck_tile/core/container/thread_buffer.hpp b/include/ck_tile/core/container/thread_buffer.hpp
index a955b7f84f..58e417a612 100644
--- a/include/ck_tile/core/container/thread_buffer.hpp
+++ b/include/ck_tile/core/container/thread_buffer.hpp
@@ -23,18 +23,6 @@ CK_TILE_HOST_DEVICE constexpr auto make_thread_buffer(Ts&&... ts)
 }
 #else
 
-#if 0
-template <typename T, index_t N>
-using thread_buffer = array<T, N>;
-
-template <typename... Ts>
-CK_TILE_HOST_DEVICE constexpr auto make_thread_buffer(Ts&&... ts)
-{
-    return make_array(ts...);
-}
-
-#endif
-
 // clang-format off
 template<typename T_, index_t N_>
 struct thread_buffer {
@@ -103,25 +91,6 @@ struct thread_buffer {
         return vx.data;
     }
 
-#if 0
-    template <typename X_,
-              index_t Is,
-              typename std::enable_if<has_same_scalar_type<value_type, X_>::value, bool>::type = false>
-    CK_TILE_HOST_DEVICE constexpr void _set_as(number<Is> is, X_ x)
-    {
-        using X = remove_cvref_t<X_>;
-
-        constexpr index_t kSPerX = vector_traits<X>::vector_size;
-
-        union {
-            X_ data;
-            tuple_array<value_type, kSPerX> sub_data;
-        } vx {x};
-
-        static_for<0, kSPerX, 1>{}(
-           [&](auto j) { operator()((is * number<sizeof(X_)/sizeof(value_type)>{}) + j) = vx.sub_data[j]; });
-    }
-#endif
 
 
 #define TB_COMMON_AS() \
diff --git a/include/ck_tile/core/container/tuple.hpp b/include/ck_tile/core/container/tuple.hpp
index 97d5ae10df..d7da0e1467 100644
--- a/include/ck_tile/core/container/tuple.hpp
+++ b/include/ck_tile/core/container/tuple.hpp
@@ -292,9 +292,6 @@ struct tuple : impl::tuple_base<make_index_sequence<sizeof...(T)>, T...>
     // below function should be used under tuple_array<> type, no extra check will perform here
     template <typename Tx> CK_TILE_HOST_DEVICE constexpr decltype(auto) get_as()                            { return reinterpret_cast<tuple_array<Tx, size()>&>(*this); }
     template <typename Tx> CK_TILE_HOST_DEVICE constexpr decltype(auto) get_as() const                      { return reinterpret_cast<const tuple_array<Tx, size()>&>(*this); }
-    // below index is for index *AFTER* type convert, not before
-    //template <typename Tx> CK_TILE_HOST_DEVICE constexpr decltype(auto) get_as(index_t i)                   { TP_COM_(); return reinterpret_cast<tuple_array<Tx, size()>&>(*this).at(i); }
-    //template <typename Tx> CK_TILE_HOST_DEVICE constexpr decltype(auto) get_as(index_t i) const             { TP_COM_(); return reinterpret_cast<const tuple_array<Tx, size()>&>(*this).at(i); }
     template <typename Tx, index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get_as(number<I>)        { TP_COM_(); return reinterpret_cast<tuple_array<Tx, size()>&>(*this).at(number<I>{}); }
     template <typename Tx, index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get_as(number<I>) const  { TP_COM_(); return reinterpret_cast<const tuple_array<Tx, size()>&>(*this).at(number<I>{}); }
 
@@ -333,13 +330,6 @@ struct vector_traits<tuple<T...>, void>
     static constexpr index_t vector_size = sizeof...(T);
 };
 
-// template <class... T>
-// CK_TILE_HOST_DEVICE constexpr
-// tuple<T...>
-// make_tuple(T const&... t)
-// {
-//     return {t...};
-// }
 template <typename... Xs>
 CK_TILE_HOST_DEVICE constexpr bool operator==(const tuple<Xs...>& a, const tuple<Xs...>& b)
 {
diff --git a/include/ck_tile/core/numeric/half.hpp b/include/ck_tile/core/numeric/half.hpp
index b6a7e86d3c..c172f48cad 100644
--- a/include/ck_tile/core/numeric/half.hpp
+++ b/include/ck_tile/core/numeric/half.hpp
@@ -264,93 +264,6 @@ bool operator>(const half_t& x, const half_t& y) { return __hgt(x.to_fp16(), y.t
 CK_TILE_DEVICE
 bool operator>=(const half_t& x, const half_t& y) { return __hge(x.to_fp16(), y.to_fp16()); }
 
-#if 0
-CK_TILE_DEVICE
-half_t operator+(const half_t& x, const half_t& y)
-{
-    return half_t(__hadd(x.to_fp16(), y.to_fp16()));
-}
-
-CK_TILE_DEVICE
-half_t operator-(const half_t& x) { return half_t(__hneg(x.to_fp16())); }
-
-CK_TILE_DEVICE
-half_t operator-(const half_t& x, const half_t& y)
-{
-    return half_t(__hsub(x.to_fp16(), y.to_fp16()));
-}
-
-CK_TILE_DEVICE
-half_t operator*(const half_t& x, const half_t& y)
-{
-    return half_t(__hmul(x.to_fp16(), y.to_fp16()));
-}
-
-CK_TILE_DEVICE
-half_t operator/(const half_t& x, const half_t& y)
-{
-    return half_t(__hdiv(x.to_fp16(), y.to_fp16()));
-}
-
-CK_TILE_DEVICE
-half_t& operator+=(half_t& x, const half_t& y)
-{
-    x = half_t(__hadd(x.to_fp16(), y.to_fp16()));
-    return x;
-}
-
-CK_TILE_DEVICE
-half_t& operator-=(half_t& x, const half_t& y)
-{
-    x = half_t(__hsub(x.to_fp16(), y.to_fp16()));
-    return x;
-}
-
-CK_TILE_DEVICE
-half_t& operator*=(half_t& x, const half_t& y)
-{
-    x = half_t(__hmul(x.to_fp16(), y.to_fp16()));
-    return x;
-}
-
-CK_TILE_DEVICE
-half_t& operator/=(half_t& x, const half_t& y)
-{
-    x = half_t(__hdiv(x.to_fp16(), y.to_fp16()));
-    return x;
-}
-
-CK_TILE_DEVICE
-half_t& operator++(half_t& x)
-{
-    x = half_t(__hadd(x.to_fp16(), half_t(1.0f).to_fp16()));
-    return x;
-}
-
-CK_TILE_DEVICE
-half_t& operator--(half_t& x)
-{
-    x = half_t(__hsub(x.to_fp16(), half_t(1.0f).to_fp16()));
-    return x;
-}
-
-CK_TILE_DEVICE
-half_t operator++(half_t& x, int)
-{
-    half_t y(x);
-    x = half_t(__hadd(x.to_fp16(), half_t(1.0f).to_fp16()));
-    return y;
-}
-
-CK_TILE_DEVICE
-half_t operator--(half_t& x, int)
-{
-    half_t y(x);
-    x = half_t(__hsub(x.to_fp16(), half_t(1.0f).to_fp16()));
-    return y;
-}
-#endif
-
 #if CK_TILE_USE_CUSTOM_DATA_TYPE
 CK_TILE_ARITHMETIC_USING_FLOAT(CK_TILE_HOST, half_t)
 #endif
diff --git a/include/ck_tile/core/numeric/int8.hpp b/include/ck_tile/core/numeric/int8.hpp
index aa9f820c17..7b0f102f2b 100644
--- a/include/ck_tile/core/numeric/int8.hpp
+++ b/include/ck_tile/core/numeric/int8.hpp
@@ -73,27 +73,6 @@ struct numeric<int8_t>
     CK_TILE_HOST_DEVICE static constexpr int8_t zero() { return 0; }
 };
 
-#if 0
-
-template <>
-struct numeric_traits<int8_t>
-{
-    static constexpr int exp            = 5;
-    static constexpr int mant           = 10;
-    static constexpr int bias           = 15;
-    static constexpr uint16_t nan_mask  = 0x7C00;
-    static constexpr uint16_t head_mask = 0xFC00;
-    static constexpr uint16_t mant_mask = 0x3FF;
-    static constexpr uint16_t exp_mask  = 0x1F;
-    static constexpr uint32_t Inf       = 0x7C00;
-    static constexpr uint32_t NegInf    = 0xFC00;
-    static constexpr uint32_t NaN       = 0x7C01;
-    static constexpr uint32_t Neg0      = 0x8000;
-    static constexpr int PackedSize           = 1;
-    using bitwise_type                  = uint16_t;
-};
-#endif
-
 CK_TILE_HOST_DEVICE
 constexpr float int8_to_float(const int8_t& x) { return static_cast<float>(x); }
 
diff --git a/include/ck_tile/core/tensor/sweep_tile.hpp b/include/ck_tile/core/tensor/sweep_tile.hpp
index 1947ce0289..35440f10f8 100644
--- a/include/ck_tile/core/tensor/sweep_tile.hpp
+++ b/include/ck_tile/core/tensor/sweep_tile.hpp
@@ -295,10 +295,6 @@ struct tile_sweeper
     F f;
 };
 
-// partial deduction is not allowed
-// template <typename T, typename F, typename U>
-// tile_sweeper(const F&, U = {})->tile_sweeper<T, F, U>;
-
 // deduction guide
 template <typename T,
           typename F,
diff --git a/include/ck_tile/core/tensor/tile_distribution.hpp b/include/ck_tile/core/tensor/tile_distribution.hpp
index aa5714e5c2..beafc03115 100644
--- a/include/ck_tile/core/tensor/tile_distribution.hpp
+++ b/include/ck_tile/core/tensor/tile_distribution.hpp
@@ -454,45 +454,6 @@ struct tile_distribution_detail
 
 } // namespace detail
 
-#if 0
-// this returns a constexpr tile_distribution
-template <typename StaticTileDistributionEncoding_>
-CK_TILE_HOST_DEVICE constexpr auto make_tile_distribution(StaticTileDistributionEncoding_)
-{
-    using DstrEncode = remove_cvref_t<StaticTileDistributionEncoding_>;
-
-    constexpr auto adaptor_impl =
-        detail::make_adaptor_encoding_for_tile_distribution(StaticTileDistributionEncoding_{});
-
-    constexpr auto ps_ys_to_xs_adaptor_impl          = adaptor_impl.template at<0>();
-    constexpr auto ys_to_d_adaptor_impl              = adaptor_impl.template at<1>();
-    constexpr index_t d_length                       = adaptor_impl.template at<2>();
-    constexpr auto rh_major_minor_to_hidden_ids_impl = adaptor_impl.template at<3>();
-
-    constexpr auto ps_ys_to_xs_adaptor =
-        CONSTRUCT_TENSOR_ADAPTOR_FROM_ENCODING(ps_ys_to_xs_adaptor_impl);
-
-    constexpr auto ys_to_d_adaptor = CONSTRUCT_TENSOR_ADAPTOR_FROM_ENCODING(ys_to_d_adaptor_impl);
-
-    constexpr auto ys_to_d_descriptor =
-        make_tensor_descriptor_from_adaptor(ys_to_d_adaptor, d_length);
-
-    //
-    constexpr index_t ndim_rh_major = DstrEncode::detail::ndim_rh_major_;
-    constexpr auto ndims_rhs_minor  = DstrEncode::detail::ndims_rhs_minor_;
-
-    constexpr auto rh_major_minor_to_hidden_ids =
-        TO_TUPLE_OF_SEQUENCE(rh_major_minor_to_hidden_ids_impl, ndim_rh_major, ndims_rhs_minor);
-
-    return tile_distribution<
-        remove_cvref_t<decltype(ps_ys_to_xs_adaptor)>,
-        remove_cvref_t<decltype(ys_to_d_descriptor)>,
-        remove_cvref_t<DstrEncode>,
-        detail::tile_distribution_detail<remove_cvref_t<decltype(rh_major_minor_to_hidden_ids)>>>{
-        ps_ys_to_xs_adaptor, ys_to_d_descriptor};
-}
-#endif
-
 // this returns a static tile_distribution
 template <typename StaticTileDistributionEncoding_>
 CK_TILE_HOST_DEVICE constexpr auto make_static_tile_distribution(StaticTileDistributionEncoding_)
diff --git a/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp b/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
index 4ad699629c..4e971649d0 100644
--- a/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
+++ b/include/ck_tile/ops/elementwise/unary_element_wise_operation.hpp
@@ -745,14 +745,6 @@ struct PassThroughPack2
     template <typename Y, typename X>
     CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const;
 
-#if 0
-    CK_TILE_HOST_DEVICE constexpr void operator()(ck_tile::fp16x2_t& y, const ck_tile::f8x2_t& x) const
-    {
-        auto t = type_convert<float2_t>(x);
-        y      = type_convert<fp16x2_t>(t);
-    }
-#endif
-
     CK_TILE_HOST_DEVICE constexpr void operator()(fp16x2_t& y, const pk_int4_t& x) const
     {
         uint8_t x_u8 = bit_cast<uint8_t>(x);
@@ -871,61 +863,6 @@ struct UnaryConvert
     }
 };
 
-#if 0
-struct ConvertBF16RTN
-{
-    // convert to bf16 using round to nearest (rtn)
-    template <typename Y, typename X>
-    CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const
-    {
-        // check Y datatype
-        static_assert(std::is_same_v<Y, ck_tile::bf16_t>, "Data type is not supported by this operation!");
-
-        // check X datatype
-        static_assert(std::is_same_v<X, float> || std::is_same_v<X, ck_tile::fp16_t>,
-                      "Data type is not supported by this operation!");
-
-        y = bf16_convert_rtn<Y>(x);
-    }
-};
-
-struct ConvertF8SR
-{
-    // convert to fp8 using stochastic rounding (SR)
-    template <typename Y, typename X>
-    CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const
-    {
-        // check Y datatype
-        static_assert(std::is_same_v<Y, ck_tile::fp8_t> || std::is_same_v<Y, ck_tile::bf8_t>,
-                      "Data type is not supported by this operation!");
-
-        // check X datatype
-        static_assert(std::is_same_v<X, float> || std::is_same_v<X, ck_tile::fp16_t>,
-                      "Data type is not supported by this operation!");
-
-        y = f8_convert_sr<Y>(x);
-    }
-};
-
-struct ConvertF8RNE
-{
-    // convert to fp8 using rounding to nearest even
-    template <typename Y, typename X>
-    CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const
-    {
-        // check Y datatype
-        static_assert(std::is_same_v<Y, ck_tile::fp8_t> || std::is_same_v<Y, ck_tile::bf8_t>,
-                      "Data type is not supported by this operation!");
-
-        // check X datatype
-        static_assert(std::is_same_v<X, float> || std::is_same_v<X, ck_tile::fp16_t>,
-                      "Data type is not supported by this operation!");
-
-        y = f8_convert_rne<Y>(x);
-    }
-};
-#endif
-
 struct Scale
 {
     static constexpr const char* name = "Scale";
diff --git a/include/ck_tile/ops/flatmm/kernel/grouped_flatmm_kernel.hpp b/include/ck_tile/ops/flatmm/kernel/grouped_flatmm_kernel.hpp
index ae33137459..ff96139f18 100644
--- a/include/ck_tile/ops/flatmm/kernel/grouped_flatmm_kernel.hpp
+++ b/include/ck_tile/ops/flatmm/kernel/grouped_flatmm_kernel.hpp
@@ -339,16 +339,6 @@ struct GroupedFlatmmKernel : FlatmmKernel<TilePartitioner_, FlatmmPipeline_, Epi
     {
         return hostArgs;
     }
-    // CK_TILE_HOST static constexpr auto
-    // MakeKernelArgs(const ContiguousGroupedFlatmmHostArgs& hostArgs)
-    // {
-    //     return hostArgs;
-    // }
-    // CK_TILE_HOST static constexpr auto
-    // MakeKernelArgs(const MaskedGroupedFlatmmHostArgs& hostArgs)
-    // {
-    //     return hostArgs;
-    // }
 
     template <class ScaleM       = FlatmmScalePointer<-1>,
               class ScaleN       = FlatmmScalePointer<-1>,
diff --git a/include/ck_tile/ops/flatmm/kernel/moe_flatmm_kernel.hpp b/include/ck_tile/ops/flatmm/kernel/moe_flatmm_kernel.hpp
index 81cf76cb07..6721577018 100644
--- a/include/ck_tile/ops/flatmm/kernel/moe_flatmm_kernel.hpp
+++ b/include/ck_tile/ops/flatmm/kernel/moe_flatmm_kernel.hpp
@@ -483,13 +483,6 @@ struct MoeFlatmmKernel
 
         if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>)
         {
-            // if(kargs.N % TilePartitioner::NPerBlock != 0 && FlatmmPipeline::kPadN == false)
-            // {
-            //     std::cerr << "Can't support N that is not a multiple of NPerBlock"
-            //                  " without padding!"
-            //               << std::endl;
-            //     return false;
-            // }
             if(kargs.N % FlatmmPipeline::GetVectorSizeB() != 0)
             {
                 std::cerr << "N is not a multiple of vector load size for B tensor!" << std::endl;
diff --git a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp
index 76d191a40c..99c35e9f30 100644
--- a/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp
+++ b/include/ck_tile/ops/flatmm/pipeline/flatmm_pipeline_agmem_bgmem_creg_v1_policy.hpp
@@ -392,10 +392,6 @@ struct UniversalFlatmmPipelineAgBgCrPolicy
         constexpr index_t M1 = BlockSize / get_warp_size();
         static_assert(M2 != 0, "M2 is zero, which will lead to a division by zero error.");
         static_assert(M1 != 0, "M1 is zero, which will lead to a division by zero error.");
-        // constexpr index_t M0 = MPerBlock / (M2 * M1);
-        // static_assert(M0 * M1 * M2 == MPerBlock,
-        //                 "Incorrect M0, M2, M1 configuration! "
-        //                 "M0, M1, M2 must cover whole MPerBlock!");
 
         return make_static_tile_distribution(
             tile_distribution_encoding<sequence<1>,
diff --git a/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp
index 0f7f742fa0..6e6547b837 100644
--- a/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp
+++ b/include/ck_tile/ops/flatmm/pipeline/mixed_prec_flatmm_pipeline_agmem_bgmem_creg_v1.hpp
@@ -1151,11 +1151,6 @@ struct F16xMXF4FlatmmPipelineAGmemBGmemCRegV1
                             a_warp_tensor(number<AwarpIter>{}) =
                                 load_tile(a_warp_windows_pong(number<AmIter>{})(number<AkIter>{}));
                         }
-                        // barrier
-                        // if constexpr((kIter == KIterPerWarp - 1) && (mIter == MIter_2nd_last))
-                        // {
-                        //     block_sync_lds();
-                        // }
                     });
                 }
             });
@@ -1636,10 +1631,6 @@ struct F8xMXF4FlatmmPipelineAGmemBGmemCRegV1
                                     ? Aload_rep
                                     : 0;
                 }
-                // if((kIter % KPerScaleLoad == 0) && (mIter == 0))
-                // {
-                //     load_perM = load_perM + 1;
-                // }
                 SchedulerPerM(dsread_perM, dswrite_perM, load_perM);
             }
         }
diff --git a/include/ck_tile/ops/flatmm/pipeline/moe_flatmm_pipeline_agmem_bgmem_creg.hpp b/include/ck_tile/ops/flatmm/pipeline/moe_flatmm_pipeline_agmem_bgmem_creg.hpp
index 543f4dc92a..fd1bb6da5a 100644
--- a/include/ck_tile/ops/flatmm/pipeline/moe_flatmm_pipeline_agmem_bgmem_creg.hpp
+++ b/include/ck_tile/ops/flatmm/pipeline/moe_flatmm_pipeline_agmem_bgmem_creg.hpp
@@ -103,13 +103,8 @@ struct MoeFlatmmPipelineAGmemBGmemCRegV1
     static constexpr index_t Aload_num_perK = dswrite_num_perK;
     static constexpr index_t Aload_rep      = dswrite_rep;
     static constexpr index_t Bload_num_perK = kNPerBlock * WG::kK / NWarp / BK1 / WaveSize;
-    // static constexpr index_t ScaleBload_K1  = ContinuousScaleNPerThread *
-    // ContinuousScaleKPerThread; static constexpr index_t ScaleBload_num =
-    //     kNPerBlock * kKPerBlock / NWarp / 32 / ScaleBload_K1 /
-    //     WaveSize; // BlockN * BlockK / NWarp / ScalePerK / ScaleB_K1 / wavesize
-    // static constexpr index_t KPerScaleLoad = KIterPerWarp / ScaleBload_num;
-    static constexpr index_t HalfMIter = (MIterPerWarp + 1) / 2;
-    static constexpr index_t Bload_rep = (Bload_num_perK + HalfMIter - 1) / HalfMIter;
+    static constexpr index_t HalfMIter      = (MIterPerWarp + 1) / 2;
+    static constexpr index_t Bload_rep      = (Bload_num_perK + HalfMIter - 1) / HalfMIter;
 
     static constexpr index_t mfma_perM_perK = NIterPerWarp * mfma_per_wg;
     static constexpr index_t dswrite_mIter  = (DsWritePreIssue - 1) % MIterPerWarp;
@@ -352,10 +347,6 @@ struct MoeFlatmmPipelineAGmemBGmemCRegV1
                                     ? Aload_rep
                                     : 0;
                 }
-                // if((kIter % KPerScaleLoad == 0) && (mIter == 0))
-                // {
-                //     load_perM = load_perM + 1;
-                // }
                 SchedulerPerM(dsread_perM, dswrite_perM, load_perM);
             }
         }
diff --git a/include/ck_tile/ops/flatmm/pipeline/mx_flatmm_pipeline_agmem_bgmem_creg_v1.hpp b/include/ck_tile/ops/flatmm/pipeline/mx_flatmm_pipeline_agmem_bgmem_creg_v1.hpp
index f698541dbf..cef66e470f 100644
--- a/include/ck_tile/ops/flatmm/pipeline/mx_flatmm_pipeline_agmem_bgmem_creg_v1.hpp
+++ b/include/ck_tile/ops/flatmm/pipeline/mx_flatmm_pipeline_agmem_bgmem_creg_v1.hpp
@@ -390,10 +390,6 @@ struct MXFlatmmPipelineAGmemBGmemCRegV1 : FlatmmPipelineAGmemBGmemCRegV1<Problem
                                     ? Aload_rep
                                     : 0;
                 }
-                // if((kIter % KPerScaleLoad == 0) && (mIter == 0))
-                // {
-                //     load_perM = load_perM + 1;
-                // }
                 SchedulerPerM(dsread_perM, dswrite_perM, load_perM);
             }
         }
diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
index 581bcc19d4..ec62e52d0e 100644
--- a/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
+++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_pipeline_qx_ks_vs_custom_policy.hpp
@@ -692,9 +692,6 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
         constexpr index_t LaneGroups = WarpSize / LanesPerK; // within a wave
         constexpr index_t NumIssues  = kNPerBlock / (LaneGroups * NumWarps);
         static_assert(NumIssues == kNPerBlock * kKPerBlock / (kBlockSize * KVector));
-        // constexpr index_t SingleKSize = NumIssues * NumWarps * (WarpSize * KVector + kPad);
-        // constexpr index_t SingleVSize =
-        // MakeVLdsBlockDescriptor<Problem>().get_element_space_size();
         constexpr index_t BufferSize =
             GetSingleSmemElementSpaceSize<Problem>(); //  max(SingleKSize, SingleVSize);
 
diff --git a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
index 52b2b86574..06ab134f85 100644
--- a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
+++ b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
@@ -456,9 +456,6 @@ struct MoeSortingKernel
     template <typename T, typename F, index_t wave_size_ = get_warp_size()>
     __device__ static constexpr T wave_reduce(T local, F reduce_f, number<wave_size_> = {})
     {
-        // constexpr int wave_size = 64;
-        // constexpr int reduce_stage = 6; // 1<<6=64
-        // clang-format off
         constexpr int reduce_stage = [](){
             if constexpr(wave_size_ == 2) return 1;
             else if constexpr(wave_size_ == 4) return 2;
@@ -1206,17 +1203,21 @@ CK_TILE_HOST_DEVICE index_t moe_sorting_mp_sem_smem_size()
 template <typename T, typename F, index_t wave_size_ = get_warp_size()>
 CK_TILE_DEVICE constexpr T moe_sorting_wave_reduce(T local, F reduce_f, number<wave_size_> = {})
 {
-    // constexpr int wave_size = 64;
-    // constexpr int reduce_stage = 6; // 1<<6=64
-    // clang-format off
-    constexpr int reduce_stage = [](){
-        if constexpr(wave_size_ == 2) return 1;
-        else if constexpr(wave_size_ == 4) return 2;
-        else if constexpr(wave_size_ == 8) return 3;
-        else if constexpr(wave_size_ == 16) return 4;
-        else if constexpr(wave_size_ == 32) return 5;
-        else if constexpr(wave_size_ == 64) return 6;
-        else return 0;
+    constexpr int reduce_stage = []() {
+        if constexpr(wave_size_ == 2)
+            return 1;
+        else if constexpr(wave_size_ == 4)
+            return 2;
+        else if constexpr(wave_size_ == 8)
+            return 3;
+        else if constexpr(wave_size_ == 16)
+            return 4;
+        else if constexpr(wave_size_ == 32)
+            return 5;
+        else if constexpr(wave_size_ == 64)
+            return 6;
+        else
+            return 0;
     }();
     // clang-format on
     T v_local = local;
@@ -3047,53 +3048,6 @@ struct MoeSortingMultiPhaseKernel_P23
                 x_r = x_v;
 #endif
                 {
-#if 0
-#pragma unroll
-                    for(int j = 0; j < index_pack / 2; j++)
-                    {
-                        int i_token = i * kBlockSize * index_pack + threadIdx.x + j * kBlockSize;
-                        index_t x   = x_d[j];
-                        int i_topk  = x - 1;          // topk of this token
-                        int i_show  = x != 0 ? 1 : 0; // has this token or not
-                        int cumsum  = i_show;
-                        impl::moe_sorting_wave_cumsum<int, get_warp_size()>(cumsum);
-
-                        __syncthreads();
-                        if(lane_id == get_warp_size() - 1)
-                        {
-                            s[4 + wave_id] = cumsum;
-                        }
-                        __syncthreads();
-
-                        // reduce cross wave
-                        static_for<0, kBlockSize / get_warp_size() - 1, 1>{}([&](auto i_w) {
-                            IndexType prev = s[4 + i_w];
-                            prev           = wave_id > i_w ? prev : 0; // mask out
-                            cumsum += prev;
-                        });
-                        cumsum += prev_cumsum; // add previous round cumsum
-                        if(threadIdx.x == kBlockSize - 1)
-                        {
-                            s[0] = cumsum;
-                        }
-                        __syncthreads();
-
-                        int position = cumsum - i_show;
-                        prev_cumsum  = s[0]; // update the last cumsum
-
-                        if(i_show)
-                        {
-#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
-                            p_sorted_token_ids[e_start + position] =
-                                MOE_SORTING_MOCK_ID(i_token, i_topk);
-#else
-                            p_sorted_token_ids[e_start + position] = i_token;
-#endif
-                            p_sorted_weights[e_start + position] =
-                                p_weights[i_token * kargs.topk_mdiv.divisor + i_topk];
-                        }
-                    }
-#endif
                     {
                         d_t i_topk;
                         d_t i_show;
@@ -3151,68 +3105,6 @@ struct MoeSortingMultiPhaseKernel_P23
                             }
                             position += i_show[j];
                         });
-
-#if 0
-                        int i_token = i * kBlockSize * index_pack + threadIdx.x * 2 + j * kBlockSize * 2;
-                        index_t x   = x_d[j];
-                        index_t x0  = static_cast<index_t>(x & 0xffff);
-                        index_t x1  = static_cast<index_t>(x >> 16);
-                        int i_topk_0  = x0 - 1;          // topk of this token
-                        int i_show_0  = x0 != 0 ? 1 : 0; // has this token or not
-                        int i_topk_1  = x1 - 1;          // topk of this token
-                        int i_show_1  = x1 != 0 ? 1 : 0; // has this token or not
-                        int cumsum  = i_show_0 + i_show_1;
-                        impl::moe_sorting_wave_cumsum<int, get_warp_size()>(cumsum);
-
-                        __syncthreads();
-                        if(lane_id == get_warp_size() - 1)
-                        {
-                            s[4 + wave_id] = cumsum;
-                        }
-                        __syncthreads();
-
-                        // reduce cross wave
-                        static_for<0, kBlockSize / get_warp_size() - 1, 1>{}([&](auto i_w) {
-                            IndexType prev = s[4 + i_w];
-                            prev           = wave_id > i_w ? prev : 0; // mask out
-                            cumsum += prev;
-                        });
-                        cumsum += prev_cumsum; // add previous round cumsum
-                        if(threadIdx.x == kBlockSize - 1)
-                        {
-                            s[0] = cumsum;
-                        }
-                        __syncthreads();
-
-                        int position_0 = cumsum - i_show_0 - i_show_1;
-                        prev_cumsum  = s[0]; // update the last cumsum
-
-                        if(i_show_0)
-                        {
-#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
-                            p_sorted_token_ids[e_start + position_0] =
-                                MOE_SORTING_MOCK_ID(i_token, i_topk_0);
-#else
-                            p_sorted_token_ids[e_start + position_0] = i_token;
-#endif
-                            p_sorted_weights[e_start + position_0] =
-                                p_weights[i_token * kargs.topk_mdiv.divisor + i_topk_0];
-                        }
-
-                        int position_1 = cumsum - i_show_1;
-
-                        if(i_show_1)
-                        {
-#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
-                            p_sorted_token_ids[e_start + position_1] =
-                                MOE_SORTING_MOCK_ID(i_token + 1, i_topk_1);
-#else
-                            p_sorted_token_ids[e_start + position_1] = i_token + 1;
-#endif
-                            p_sorted_weights[e_start + position_1] =
-                                p_weights[(i_token + 1) * kargs.topk_mdiv.divisor + i_topk_1];
-                        }
-#endif
                     }
                 }
             }
diff --git a/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp b/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp
index f70f4ddacc..828847091a 100644
--- a/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp
+++ b/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp
@@ -14,14 +14,6 @@
 
 namespace ck_tile {
 
-// template <typename Problem_, typename Policy_ = MoeSortingPolicy>
-// struct MoeSortingPipeline
-// {
-//     // TODO: this kernel only support warp per row
-//     using Problem    = remove_cvref_t<Problem_>;
-//     using Policy     = remove_cvref_t<Policy_>;
-//     using WeightType = typename Problem::WeightType;
-
 //     template <typename TopkIdWindow, typename WeightWindow>
 //     CK_TILE_DEVICE auto operator()(const TopkIdWindow& topk_id_window,
 //                                    const WeightWindow& weight_window,
diff --git a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp
index a7f1cef519..1a61b69b34 100644
--- a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp
@@ -36,9 +36,6 @@ struct BlockGemmARegBSmemCRegOneWarpV1
                 std::is_same_v<CDataType, remove_cv_t<typename CBlockTensor::DataType>>,
             "wrong!");
 
-        // constexpr index_t MPerBlock = ABlockTensorTmp{}.get_lengths()[number<0>{}];
-        // constexpr index_t NPerBlock = BBlockWindowTmp{}.get_window_lengths()[number<0>{}];
-        // constexpr index_t KPerBlock = ABlockTensorTmp{}.get_lengths()[number<1>{}];
         constexpr index_t MPerBlock = BlockGemmShape::kM;
         constexpr index_t NPerBlock = BlockGemmShape::kN;
         constexpr index_t KPerBlock = BlockGemmShape::kK;
diff --git a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_default_policy.hpp b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_default_policy.hpp
index 2280f6f875..3a7c0362f7 100644
--- a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_default_policy.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_default_policy.hpp
@@ -19,30 +19,7 @@ struct BlockGemmARegBSmemCRegV1DefaultPolicy
                      std::is_same_v<typename Problem::BDataType, half_t> &&
                      std::is_same_v<typename Problem::CDataType, float>)
         {
-#if 0
-            constexpr index_t kBlockSize = Problem::kBlockSize;
-
-            constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
-            constexpr index_t kNPerBlock = Problem::BlockGemmShape::kN;
-            constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
-
-            static_assert(kBlockSize % get_warp_size() == 0, "wrong!");
-
-            constexpr index_t NumWarp = kBlockSize / get_warp_size();
-
-            // FIXME
-            if constexpr(NumWarp == 4 && kMPerBlock % 128 == 0 &&
-                         kNPerBlock % 128 == 0 % kKPerBlock % 16 == 0)
-            {
-                return make_tuple(WarpGemmMfmaF16F16F32M32N32K8{}, 4, 1);
-            }
-            else
-            {
-                return make_tuple(WarpGemmMfmaF16F16F32M32N32K8{}, 4, 1);
-            }
-#else
             return make_tuple(WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution{}, 4, 1);
-#endif
         }
         else if constexpr(std::is_same_v<typename Problem::ADataType, bf16_t> &&
                           std::is_same_v<typename Problem::BDataType, bf16_t> &&
diff --git a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2_default_policy.hpp b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2_default_policy.hpp
index b8290c95d8..0b1cea9425 100644
--- a/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2_default_policy.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2_default_policy.hpp
@@ -16,30 +16,7 @@ struct BlockGemmARegBSmemCRegV2DefaultPolicy
     CK_TILE_HOST_DEVICE static constexpr auto GetWarpGemmMWarpNWarp()
     {
 
-#if 0
-        constexpr index_t kBlockSize = Problem::kBlockSize;
-
-        constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
-        constexpr index_t kNPerBlock = Problem::BlockGemmShape::kN;
-        constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
-
-        static_assert(kBlockSize % get_warp_size() == 0, "wrong!");
-
-        constexpr index_t NumWarp = kBlockSize / get_warp_size();
-
-        // FIXME
-        if constexpr(NumWarp == 4 && kMPerBlock % 128 == 0 &&
-                     kNPerBlock % 128 == 0 % kKPerBlock % 16 == 0)
-        {
-            return make_tuple(WarpGemmMfmaF16F16F32M32N32K8{}, 4, 1);
-        }
-        else
-        {
-            return make_tuple(WarpGemmMfmaF16F16F32M32N32K8{}, 4, 1);
-        }
-#else
         return make_tuple(WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution{}, 4, 1);
-#endif
     }
 };
 
diff --git a/include/ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1_default_policy.hpp b/include/ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1_default_policy.hpp
index 29022e764f..0622cc624f 100644
--- a/include/ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1_default_policy.hpp
+++ b/include/ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1_default_policy.hpp
@@ -19,30 +19,7 @@ struct BlockGemmASmemBRegCRegV1DefaultPolicy
                      std::is_same_v<typename Problem::BDataType, half_t> &&
                      std::is_same_v<typename Problem::CDataType, float>)
         {
-#if 0
-            constexpr index_t kBlockSize = Problem::kBlockSize;
-
-            constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
-            constexpr index_t kNPerBlock = Problem::BlockGemmShape::kN;
-            constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
-
-            static_assert(kBlockSize % get_warp_size() == 0, "wrong!");
-
-            constexpr index_t NumWarp = kBlockSize / get_warp_size();
-
-            // FIXME
-            if constexpr(NumWarp == 4 && kMPerBlock % 128 == 0 &&
-                         kNPerBlock % 128 == 0 % kKPerBlock % 16 == 0)
-            {
-                return make_tuple(WarpGemmMfmaF16F16F32M32N32K8{}, 4, 1);
-            }
-            else
-            {
-                return make_tuple(WarpGemmMfmaF16F16F32M32N32K8{}, 4, 1);
-            }
-#else
             return make_tuple(WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution{}, 4, 1);
-#endif
         }
         else if constexpr(std::is_same_v<typename Problem::ADataType, bf16_t> &&
                           std::is_same_v<typename Problem::BDataType, bf16_t> &&
diff --git a/include/ck_tile/ops/norm_reduce/block/block_norm_reduce.hpp b/include/ck_tile/ops/norm_reduce/block/block_norm_reduce.hpp
index da9c5c4d57..717fb4678c 100644
--- a/include/ck_tile/ops/norm_reduce/block/block_norm_reduce.hpp
+++ b/include/ck_tile/ops/norm_reduce/block/block_norm_reduce.hpp
@@ -120,10 +120,6 @@ struct BlockNormReduceSync
 
         constexpr index_t idim_p_lane = NDimP - 1;
 
-        // const auto ps_idx = make_array<index_t>(get_warp_id(), get_lane_id());
-        // const auto rs_idx =
-        //     mean_tensor.get_tile_distribution().calculate_rs_index_from_ps_index(ps_idx);
-
         constexpr index_t thread_buf_size = MeanDistributedTensor_::get_thread_buffer_size();
         static_assert(thread_buf_size == VarDistributedTensor_::get_thread_buffer_size());
 
@@ -360,17 +356,6 @@ struct BlockNormReduceCrossWarpSync
 template <typename BlockShape>
 CK_TILE_DEVICE constexpr index_t block_tile_welford_calculate_max_count(int row_size)
 {
-#if 0
-    using S                   = BlockShape;
-    index_t LastloopN         = row_size % S::Block_N == 0 ? S::Block_N : row_size % S::Block_N;
-    constexpr index_t NThread = S::WarpPerBlock_N * S::ThreadPerWarp_N;
-    index_t iNLane            = get_thread_id() % NThread;
-    index_t iN0               = LastloopN / (S::Vector_N * S::ThreadPerWarp_N);
-    index_t iN1               = (LastloopN % (S::Vector_N * S::ThreadPerWarp_N)) / S::Vector_N;
-    index_t N2                = (LastloopN % (S::Vector_N * S::ThreadPerWarp_N)) % S::Vector_N;
-    index_t iN3               = iNLane < iN1 ? S::Vector_N : iNLane == iN1 ? N2 : 0;
-    return iN0 * S::Vector_N + iN3;
-#endif
     using S_                            = BlockShape;
     constexpr index_t ThreadsPerBlock_N = S_::WarpPerBlock_N * S_::ThreadPerWarp_N;
 
diff --git a/include/ck_tile/ops/reduce/block/block_reduce2d.hpp b/include/ck_tile/ops/reduce/block/block_reduce2d.hpp
index abad5ed031..a14f103eb6 100644
--- a/include/ck_tile/ops/reduce/block/block_reduce2d.hpp
+++ b/include/ck_tile/ops/reduce/block/block_reduce2d.hpp
@@ -140,28 +140,6 @@ struct BlockReduce2d
                                            ReducePacksPerXDim{});
     }
 
-#if 0
-        constexpr auto I0 = number<0>{};
-        constexpr auto I1 = number<1>{};
-        constexpr auto spans = XDistributedTensor_::get_distributed_spans();
-
-        // FIXME: hard coded to reduce 2nd axis
-        sweep_tile_span(spans[I0], [&](auto dstr_idx_i0) {
-            constexpr auto y_dstr_idx = make_tuple(dstr_idx_i0);
-
-            auto y = y_tensor[y_dstr_idx];
-
-            sweep_tile_span(spans[I1], [&](auto dstr_idx_i1) {
-                constexpr auto in_dstr_idx = make_tuple(dstr_idx_i0, dstr_idx_i1);
-                const auto x = ck_tile::type_convert<ComputeDataType>(x_tensor[in_dstr_idx]);
-
-                y = reduce_func(y, x);
-            });
-
-            y_tensor(y_dstr_idx) = y;
-        });
-#endif
-
     template <typename XDistributedTensor_>
     CK_TILE_DEVICE static auto MakeYBlockTile()
     {
@@ -240,10 +218,6 @@ struct BlockReduce2dSync
 
         constexpr index_t idim_p_lane = NDimP - 1;
 
-        // const auto ps_idx = make_array<index_t>(get_warp_id(), get_lane_id());
-        // const auto rs_idx =
-        //     y_tensor.get_tile_distribution().calculate_rs_index_from_ps_index(ps_idx);
-
         constexpr index_t thread_buf_size = YDistributedTensor_::get_thread_buffer_size();
 
         // loop over thread data
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_streamk.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_streamk.hpp
index f46ebbacf7..348216129f 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_streamk.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_streamk.hpp
@@ -52,66 +52,6 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGemmSt
     static auto GetInstances()
     {
         std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
-#if 0
-        if constexpr(is_same_v<ADataType, float> && is_same_v<BDataType, float> &&
-                     is_same_v<CDataType, float>)
-        {
-            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
-                         is_same_v<CLayout, Row>)
-            {
-                add_device_gemm_xdl_splitk_f32_f32_f32_mk_kn_mn_instances(op_ptrs);
-            }
-            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
-                              is_same_v<CLayout, Row>)
-            {
-                add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances(op_ptrs);
-            }
-            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
-                              is_same_v<CLayout, Row>)
-            {
-                add_device_gemm_xdl_splitk_f32_f32_f32_km_kn_mn_instances(op_ptrs);
-            }
-            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
-                              is_same_v<CLayout, Row>)
-            {
-                add_device_gemm_xdl_splitk_f32_f32_f32_km_nk_mn_instances(op_ptrs);
-            }
-        }
-        else if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
-                          is_same_v<CDataType, half_t>)
-        {
-            if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Row> &&
-                         is_same_v<CLayout, Row>)
-            {
-                add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v1_instances(op_ptrs);
-                add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v1_irregular_instances(op_ptrs);
-                add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v1_interwave_instances(op_ptrs);
-                add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v1_interwave_irregular_instances(op_ptrs);
-                add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v2_instances(op_ptrs);
-                add_device_gemm_xdl_splitk_f16_f16_f16_mk_kn_mn_v2_irregular_instances(op_ptrs);
-            }
-            else if constexpr(is_same_v<ALayout, Row> && is_same_v<BLayout, Col> &&
-                              is_same_v<CLayout, Row>)
-            {
-                add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_v1_instances(op_ptrs);
-                add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_v1_irregular__instances(op_ptrs);
-                add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_v1_interwave_instances(op_ptrs);
-                add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_v1_interwave_irregular_instances(op_ptrs);
-                add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_v2_instances(op_ptrs);
-                add_device_gemm_xdl_splitk_f16_f16_f16_mk_nk_mn_v2_irregular_instances(op_ptrs);
-            }
-            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Row> &&
-                              is_same_v<CLayout, Row>)
-            {
-                add_device_gemm_xdl_splitk_f16_f16_f16_km_kn_mn_instances(op_ptrs);
-            }
-            else if constexpr(is_same_v<ALayout, Col> && is_same_v<BLayout, Col> &&
-                              is_same_v<CLayout, Row>)
-            {
-                add_device_gemm_xdl_splitk_f16_f16_f16_km_nk_mn_instances(op_ptrs);
-            }
-        }
-#endif
         if constexpr(is_same_v<ADataType, half_t> && is_same_v<BDataType, half_t> &&
                      is_same_v<CDataType, half_t>)
         {
diff --git a/library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn.hpp
index 3d391ae931..a8d69afb9a 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_b_scale/device_gemm_b_scale_xdl_f16_i4_f16/device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn.hpp
@@ -33,12 +33,6 @@ static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
 static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
 static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
-#if 0
-template <GemmSpecialization GemmSpec>
-using device_gemm_xdl_b_scale_f16_i4_f16_mk_nk_mn_comp_instances = std::tuple<
-
-#endif
-
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_b_scale_xdl_f16_i4_f16_mk_nk_mn_mem_instances = std::tuple<
     // clang-format off
diff --git a/library/src/tensor_operation_instance/gpu/gemm_streamk/device_gemm_xdl_streamk_f16_f16_f16_mk_kn_mn_instance.cpp b/library/src/tensor_operation_instance/gpu/gemm_streamk/device_gemm_xdl_streamk_f16_f16_f16_mk_kn_mn_instance.cpp
index 600154a9fd..919236deee 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_streamk/device_gemm_xdl_streamk_f16_f16_f16_mk_kn_mn_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_streamk/device_gemm_xdl_streamk_f16_f16_f16_mk_kn_mn_instance.cpp
@@ -26,9 +26,6 @@ using S = ck::Sequence<Is...>;
 
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 
-// static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
-// static constexpr auto GemmMNPadding =
-// ck::tensor_operation::device::GemmSpecialization::MNPadding;
 using device_gemm_xdl_streamk_f16_f16_f16_mk_kn_mn_generic_instances = std::tuple<
     // clang-format off
         //##################|AData| BData| CData| AccData| ALayout| BLayout| CLayout|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i4_bf16/device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i4_bf16/device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn.hpp
index 8ba6c485cb..99e809f0ec 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i4_bf16/device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_bf16_i4_bf16/device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn.hpp
@@ -33,25 +33,6 @@ static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
 static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
 static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
-#if 0
-template <GemmSpecialization GemmSpec>
-using device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_comp_instances = std::tuple<
-    // clang-format off
-        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
-        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
-        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
-        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-        
-        // Compute friendly
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,    I4,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,    I4,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,  16,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,    I4,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,    I4,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     BF16,    I4,  BF16,   F32,     BF16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
-    // clang-format on
-    >;
-#endif
-
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_bf16_i4_bf16_mk_nk_mn_mem_instances = std::tuple<
     // clang-format off
diff --git a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_i4_f16/device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn.hpp b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_i4_f16/device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn.hpp
index 088378b918..c52b9723a9 100644
--- a/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_i4_f16/device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn.hpp
+++ b/library/src/tensor_operation_instance/gpu/gemm_universal/device_gemm_xdl_universal_f16_i4_f16/device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn.hpp
@@ -33,25 +33,6 @@ static constexpr auto GemmMNKPadding = GemmSpecialization::MNKPadding;
 static constexpr auto Intrawave = BlockGemmPipelineScheduler::Intrawave;
 static constexpr auto Interwave = BlockGemmPipelineScheduler::Interwave;
 
-#if 0
-template <GemmSpecialization GemmSpec>
-using device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_comp_instances = std::tuple<
-    // clang-format off
-        //#########################| ALayout| BLayout| CLayout|AData| BData| CData| AccData| Cshuffle|           A|           B|           C|          GEMM| Block|  MPer|  NPer|  KPer| AK1| BK1|MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|    CShuffle|     CBlockTransferClusterLengths|  CBlockTransfer|                         Block-wiseGemm|               Block-wiseGemm|
-        //#########################|        |        |        | Type|  Type|  Type|    Type|     Type| Elementwise| Elementwise| Elementwise|Specialization|  Size| Block| Block| Block|    |    | XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MXdlPerWave_MWaveMPerXdl| ScalarPerVector|                               Pipeline|                     Pipeline|
-        //#########################|        |        |        |     |      |      |        |         |   Operation|   Operation|   Operation|              |      |      |      |      |    |    |    |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle|  PerShuffle| _NBlock_NXdlPerWave_NWaveNPerXdl|   _NWaveNPerXdl|                              Scheduler|                     Verision|
-        //#########################|        |        |        |     |      |      |        |         |            |            |            |              |      |      |      |      |    |    |    |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |            |                                 |                |                                       |                             |
-        
-        // Compute friendly
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v4>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   224,   256,    64,   8,  16,  16,   16,    7,    8,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           2,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v3>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v5>,
-        DeviceGemm_Xdl_CShuffleV3<  Row,     Col,     Row,     F16,    I4,  F16,   F32,     F16,      PassThrough, PassThrough, PassThrough,       GemmSpec,   256,   128,   128,    64,   8,  16,  32,   32,    2,    2,     S<8, 32, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,              8,              8,          0,    S<4, 64, 1>,     S<1, 0, 2>,    S<1, 0, 2>,               2,            16,             16,          0,          1,           1,                   S<1, 32, 1, 8>,               8,  BlockGemmPipelineScheduler::Interwave, BlockGemmPipelineVersion::v1>
-    // clang-format on
-    >;
-#endif
-
 template <BlockGemmPipelineScheduler BlkGemmPipeSched, GemmSpecialization GemmSpec>
 using device_gemm_xdl_universal_f16_i4_f16_mk_nk_mn_mem_instances =
     std::tuple<
diff --git a/profiler/src/profile_softmax.cpp b/profiler/src/profile_softmax.cpp
index 096a2d4eb4..31cc0fd23a 100644
--- a/profiler/src/profile_softmax.cpp
+++ b/profiler/src/profile_softmax.cpp
@@ -278,11 +278,4 @@ int profile_softmax(int argc, char* argv[])
     return 0;
 }
 
-// hijack main() for quick debugging
-// int main(int argc, char* argv[])
-// {
-//     profile_normalization(argc, argv);
-//     return 0;
-// }
-
 REGISTER_PROFILER_OPERATION("softmax", "Softmax", profile_softmax);
diff --git a/test/block_swizzle_test/block_swizzle_test.cpp b/test/block_swizzle_test/block_swizzle_test.cpp
index 36a26492cf..af1bc0658e 100644
--- a/test/block_swizzle_test/block_swizzle_test.cpp
+++ b/test/block_swizzle_test/block_swizzle_test.cpp
@@ -120,17 +120,7 @@ struct block_dispatcher_t
 
     uint32_t get_grid_dims_x() { return dp_start_block_idx + dp_num_blocks; }
 
-    uint32_t get_block_idx(uint32_t bid)
-    {
-        // block id is linearily allocated along sk blocks (dp blocks are fine)
-        // this function will compute blockIdx.x and the linear sk block mapping
-        // uint32_t block_idx = 0;
-        // if(bid < sk_num_big_blocks) {
-        //     uint32_t current_k_iter = bid * k_iters_per_big_block;
-        //     tile_idx = current_k_iter / k_iters_per_tile;
-        // }
-        return bid;
-    }
+    uint32_t get_block_idx(uint32_t bid) { return bid; }
 
     uint32_t get_current_itr(uint32_t block_idx)
     {
diff --git a/test/ck_tile/add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1024_instance.cpp b/test/ck_tile/add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1024_instance.cpp
index 8f4813a47e..ca49114844 100644
--- a/test/ck_tile/add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1024_instance.cpp
+++ b/test/ck_tile/add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_bf16_n1024_instance.cpp
@@ -5,14 +5,6 @@
 
 // clang-format off
 //                                                                rm  rn  tm  tn  vn  pd      x     3p
-#if 0
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1,  2,  4,  64, 8,  true , true, false>>(const S&, A);
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1,  4,  4,  64, 4,  true , true, false>>(const S&, A);
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1,  8,  4,  64, 2,  true , true, false>>(const S&, A);
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1, 16,  4,  64, 1,  true , true, false>>(const S&, A);
-
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t, 1,  1,  1, 256, 4,  true , true, false>>(const S&, A);
-#endif
 
 template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  ck_tile::int8_t, 1, 1, 2,  128, 8,  true,  true, false>>(const S&, A);
 template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::bf16_t,  ck_tile::int8_t, 1, 2, 2,  128, 4,  true,  true, false>>(const S&, A);
diff --git a/test/ck_tile/add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1024_instance.cpp b/test/ck_tile/add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1024_instance.cpp
index e357d7e3ac..f754d8e959 100644
--- a/test/ck_tile/add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1024_instance.cpp
+++ b/test/ck_tile/add_rmsnorm2d_rdquant/instances/add_rmsnorm2d_rdquant_fwd_fp16_n1024_instance.cpp
@@ -5,14 +5,6 @@
 
 // clang-format off
 //                                                                rm  rn  tm  tn  vn  pd      x     3p
-#if 0
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 8,  true , true, false>>(const S&, A);
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1,  4,  4,  64, 4,  true , true, false>>(const S&, A);
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1,  8,  4,  64, 2,  true , true, false>>(const S&, A);
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1, 16,  4,  64, 1,  true , true, false>>(const S&, A);
-
-template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t, 1,  1,  1, 256, 4,  true , true, false>>(const S&, A);
-#endif
 
 template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  ck_tile::int8_t, 1, 1, 2,  128, 8,  true,  true, false>>(const S&, A);
 template float add_rmsnorm2d_rdquant_fwd_<trait_<ck_tile::fp16_t,  ck_tile::int8_t, 1, 2, 2,  128, 4,  true,  true, false>>(const S&, A);
diff --git a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp
index 8c72b81dc1..56fcca3beb 100644
--- a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp
+++ b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_bf16_n1024_instance.cpp
@@ -5,14 +5,6 @@
 
 // clang-format off
 //                                                  rm rn  tm   tn  vn   pd   2p
-#if 0
-template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1,  2,  4,  64, 8,  true, false>>(const S&, A);
-template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1,  4,  4,  64, 4,  true, false>>(const S&, A);
-template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1,  8,  4,  64, 2,  true, false>>(const S&, A);
-template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1, 16,  4,  64, 1,  true, false>>(const S&, A);
-
-template float moe_smoothquant_<trait_<ck_tile::bf16_t, 1,  1,  1, 256, 4,  true, false>>(const S&, A);
-#endif
 
 template float moe_smoothquant_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 1, 2,  128, 8,  true, false>>(const S&, A);
 template float moe_smoothquant_<trait_<ck_tile::bf16_t, ck_tile::int8_t, 1, 2, 2,  128, 4,  true, false>>(const S&, A);
diff --git a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp
index 6d7a5e7c1f..2462cd218e 100644
--- a/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp
+++ b/test/ck_tile/moe_smoothquant/instances/moe_smoothquant_fp16_n1024_instance.cpp
@@ -5,14 +5,6 @@
 
 // clang-format off
 //                                                  rm rn tm    tn  vn   pd    2p
-#if 0
-template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 8,  true ,false>>(const S&, A);
-template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1,  4,  4,  64, 4,  true ,false>>(const S&, A);
-template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1,  8,  4,  64, 2,  true ,false>>(const S&, A);
-template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1, 16,  4,  64, 1,  true ,false>>(const S&, A);
-
-template float moe_smoothquant_<trait_<ck_tile::fp16_t, 1,  1,  1, 256, 4,  true ,false>>(const S&, A);
-#endif
 
 template float moe_smoothquant_<trait_<ck_tile::fp16_t, ck_tile::int8_t,  1, 1, 2,  128, 8,  true, false>>(const S&, A);
 template float moe_smoothquant_<trait_<ck_tile::fp16_t, ck_tile::int8_t,  1, 2, 2,  128, 4,  true, false>>(const S&, A);
diff --git a/test/ck_tile/smoothquant/instances/smoothquant_bf16_n1024_instance.cpp b/test/ck_tile/smoothquant/instances/smoothquant_bf16_n1024_instance.cpp
index 8a5e0c74a0..66f427247a 100644
--- a/test/ck_tile/smoothquant/instances/smoothquant_bf16_n1024_instance.cpp
+++ b/test/ck_tile/smoothquant/instances/smoothquant_bf16_n1024_instance.cpp
@@ -5,14 +5,6 @@
 
 // clang-format off
 //                                                  rm rn  tm   tn  vn   pd   2p
-#if 0
-template float smoothquant_<trait_<ck_tile::bf16_t, 1,  2,  4,  64, 8,  true, false>>(const S&, A);
-template float smoothquant_<trait_<ck_tile::bf16_t, 1,  4,  4,  64, 4,  true, false>>(const S&, A);
-template float smoothquant_<trait_<ck_tile::bf16_t, 1,  8,  4,  64, 2,  true, false>>(const S&, A);
-template float smoothquant_<trait_<ck_tile::bf16_t, 1, 16,  4,  64, 1,  true, false>>(const S&, A);
-
-template float smoothquant_<trait_<ck_tile::bf16_t, 1,  1,  1, 256, 4,  true, false>>(const S&, A);
-#endif
 
 template float smoothquant_<trait_<ck_tile::bf16_t,  1, 1, 2,  128, 8,  true, false>>(const S&, A);
 template float smoothquant_<trait_<ck_tile::bf16_t,  1, 2, 2,  128, 4,  true, false>>(const S&, A);
diff --git a/test/ck_tile/smoothquant/instances/smoothquant_fp16_n1024_instance.cpp b/test/ck_tile/smoothquant/instances/smoothquant_fp16_n1024_instance.cpp
index 9c08cf64f0..103f7281b0 100644
--- a/test/ck_tile/smoothquant/instances/smoothquant_fp16_n1024_instance.cpp
+++ b/test/ck_tile/smoothquant/instances/smoothquant_fp16_n1024_instance.cpp
@@ -5,14 +5,6 @@
 
 // clang-format off
 //                                                  rm rn tm    tn  vn   pd    2p
-#if 0
-template float smoothquant_<trait_<ck_tile::fp16_t, 1,  2,  4,  64, 8,  true ,false>>(const S&, A);
-template float smoothquant_<trait_<ck_tile::fp16_t, 1,  4,  4,  64, 4,  true ,false>>(const S&, A);
-template float smoothquant_<trait_<ck_tile::fp16_t, 1,  8,  4,  64, 2,  true ,false>>(const S&, A);
-template float smoothquant_<trait_<ck_tile::fp16_t, 1, 16,  4,  64, 1,  true ,false>>(const S&, A);
-
-template float smoothquant_<trait_<ck_tile::fp16_t, 1,  1,  1, 256, 4,  true ,false>>(const S&, A);
-#endif
 
 template float smoothquant_<trait_<ck_tile::fp16_t,  1, 1, 2,  128, 8,  true, false>>(const S&, A);
 template float smoothquant_<trait_<ck_tile::fp16_t,  1, 2, 2,  128, 4,  true, false>>(const S&, A);