[rocm-libraries] ROCm/rocm-libraries#6302 (commit 8d419e8)

CK: Remove 41 commented-out dead code blocks (~200 lines)
 (#6302)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Depends on #6300

## Summary

Remove 41 commented-out code blocks across 33 files in Composable
Kernel, totaling ~200 lines.

Identified using an automated dead code scanning skill (`ck-dead-code`)
with a calibrated two-stage pipeline:
1. **Pre-filter**: Keyword-based scan found 1,338 `//`-commented blocks.
Calibrated heuristics (trained on 50-sample expert classification)
reduced to 89 high-confidence candidates — 93% noise reduction.
2. **Expert triage**: LLM expert classified each block in context as
CODE_REMOVE, CODE_KEEP, or NOT_CODE.

| Classification | Count |
|---------------|-------|
| Removed (this PR) | 41 |
| Kept (debug helpers, alt configs, reference impls) | 32 |
| Not code (false positives) | 16 |

Removed blocks include: superseded implementations, old test data,
abandoned stubs, unreachable code, and buggy dead code.
This commit is contained in:
Aviral Goel
2026-04-10 15:18:02 +00:00
committed by assistant-librarian[bot]
parent 4d0bbe5d17
commit e0dfe58d66
82 changed files with 22 additions and 2883 deletions

View File

@@ -2166,27 +2166,11 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
}
else if constexpr(N == 8)
{
#if 0
thread_buffer<fp16_t, 8> tmp{src_thread_data};
llvm_amdgcn_raw_buffer_store_fp16x4(tmp.template get_as<fp16x4_t>()[number<0>{}],
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
llvm_amdgcn_raw_buffer_store_fp16x4(tmp.template get_as<fp16x4_t>()[number<1>{}],
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset + 4 * sizeof(fp16_t),
static_cast<index_t>(coherence));
#else
llvm_amdgcn_raw_buffer_store_fp32x4(bit_cast<fp32x4_t>(src_thread_data),
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
#endif
}
}
else if constexpr(std::is_same<T, bf16_t>::value) // bf16

View File

@@ -1992,27 +1992,11 @@ CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer<T, N> src_thread_d
}
else if constexpr(N == 8)
{
#if 0
thread_buffer<fp16_t, 8> tmp{src_thread_data};
llvm_amdgcn_raw_buffer_store_fp16x4(tmp.template get_as<fp16x4_t>()[number<0>{}],
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
llvm_amdgcn_raw_buffer_store_fp16x4(tmp.template get_as<fp16x4_t>()[number<1>{}],
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset + 4 * sizeof(fp16_t),
static_cast<index_t>(coherence));
#else
llvm_amdgcn_raw_buffer_store_fp32x4(bit_cast<fp32x4_t>(src_thread_data),
dst_wave_buffer_resource,
dst_thread_addr_offset,
dst_wave_addr_offset,
static_cast<index_t>(coherence));
#endif
}
}
else if constexpr(std::is_same<T, bf16_t>::value) // bf16

View File

@@ -84,19 +84,6 @@ struct array
data[i] = static_cast<value_type>(c);
}
// template <typename Y>
// CK_TILE_HOST_DEVICE constexpr array(const array& o)
// {
// // static_assert(ArrayType::size() == size(), "wrong! size not the same");
// __content = o.__content;
// }
// CK_TILE_HOST_DEVICE constexpr array& operator=(const array& o)
// {
// // static_assert(ArrayType::size() == size(), "wrong! size not the same");
// __content = o.__content;
// return *this;
// }
CK_TILE_HOST_DEVICE static constexpr auto size() { return N; }
CK_TILE_HOST_DEVICE static constexpr bool is_static() { return is_static_v<value_type>; }
@@ -247,13 +234,6 @@ CK_TILE_HOST_DEVICE constexpr details::return_type<D, Ts...> make_array(Ts&&...
return {std::forward<Ts>(ts)...};
}
// // make empty array
// template <typename T>
// CK_TILE_HOST_DEVICE constexpr auto make_array()
// {
// return array<T, 0>{};
// }
// compatible with old ck's initializer, make an array and fill it with the last element from
// initializer_list
template <typename T, index_t Size>

View File

@@ -480,32 +480,6 @@ struct sequence_split
using right_type = decltype(Seq::extract(range1{}));
};
#if 0
// reverse sequence
template <typename Seq>
struct sequence_reverse
{
static constexpr index_t NSize = Seq{}.size();
using seq_split = sequence_split<Seq, NSize / 2>;
using type = typename sequence_merge<
typename sequence_reverse<typename seq_split::right_type>::type,
typename sequence_reverse<typename seq_split::left_type>::type>::type;
};
template <index_t I>
struct sequence_reverse<sequence<I>>
{
using type = sequence<I>;
};
template <index_t I0, index_t I1>
struct sequence_reverse<sequence<I0, I1>>
{
using type = sequence<I1, I0>;
};
#endif
namespace detail {
template <typename Id, index_t... Ns>
struct seq_reverse;

View File

@@ -24,18 +24,4 @@ using statically_indexed_array = array<T, N>;
#endif
// consider always use ck_tile::array for this purpose
#if 0
template <typename X, typename... Xs>
CK_TILE_HOST_DEVICE constexpr auto make_statically_indexed_array(const X& x, const Xs&... xs)
{
return statically_indexed_array<X, sizeof...(Xs) + 1>(x, static_cast<X>(xs)...);
}
// make empty statically_indexed_array
template <typename X>
CK_TILE_HOST_DEVICE constexpr auto make_statically_indexed_array()
{
return statically_indexed_array<X, 0>();
}
#endif
} // namespace ck_tile

View File

@@ -23,18 +23,6 @@ CK_TILE_HOST_DEVICE constexpr auto make_thread_buffer(Ts&&... ts)
}
#else
#if 0
template <typename T, index_t N>
using thread_buffer = array<T, N>;
template <typename... Ts>
CK_TILE_HOST_DEVICE constexpr auto make_thread_buffer(Ts&&... ts)
{
return make_array(ts...);
}
#endif
// clang-format off
template<typename T_, index_t N_>
struct thread_buffer {
@@ -103,25 +91,6 @@ struct thread_buffer {
return vx.data;
}
#if 0
template <typename X_,
index_t Is,
typename std::enable_if<has_same_scalar_type<value_type, X_>::value, bool>::type = false>
CK_TILE_HOST_DEVICE constexpr void _set_as(number<Is> is, X_ x)
{
using X = remove_cvref_t<X_>;
constexpr index_t kSPerX = vector_traits<X>::vector_size;
union {
X_ data;
tuple_array<value_type, kSPerX> sub_data;
} vx {x};
static_for<0, kSPerX, 1>{}(
[&](auto j) { operator()((is * number<sizeof(X_)/sizeof(value_type)>{}) + j) = vx.sub_data[j]; });
}
#endif
#define TB_COMMON_AS() \

View File

@@ -292,9 +292,6 @@ struct tuple : impl::tuple_base<make_index_sequence<sizeof...(T)>, T...>
// below function should be used under tuple_array<> type, no extra check will perform here
template <typename Tx> CK_TILE_HOST_DEVICE constexpr decltype(auto) get_as() { return reinterpret_cast<tuple_array<Tx, size()>&>(*this); }
template <typename Tx> CK_TILE_HOST_DEVICE constexpr decltype(auto) get_as() const { return reinterpret_cast<const tuple_array<Tx, size()>&>(*this); }
// below index is for index *AFTER* type convert, not before
//template <typename Tx> CK_TILE_HOST_DEVICE constexpr decltype(auto) get_as(index_t i) { TP_COM_(); return reinterpret_cast<tuple_array<Tx, size()>&>(*this).at(i); }
//template <typename Tx> CK_TILE_HOST_DEVICE constexpr decltype(auto) get_as(index_t i) const { TP_COM_(); return reinterpret_cast<const tuple_array<Tx, size()>&>(*this).at(i); }
template <typename Tx, index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get_as(number<I>) { TP_COM_(); return reinterpret_cast<tuple_array<Tx, size()>&>(*this).at(number<I>{}); }
template <typename Tx, index_t I> CK_TILE_HOST_DEVICE constexpr decltype(auto) get_as(number<I>) const { TP_COM_(); return reinterpret_cast<const tuple_array<Tx, size()>&>(*this).at(number<I>{}); }
@@ -333,13 +330,6 @@ struct vector_traits<tuple<T...>, void>
static constexpr index_t vector_size = sizeof...(T);
};
// template <class... T>
// CK_TILE_HOST_DEVICE constexpr
// tuple<T...>
// make_tuple(T const&... t)
// {
// return {t...};
// }
template <typename... Xs>
CK_TILE_HOST_DEVICE constexpr bool operator==(const tuple<Xs...>& a, const tuple<Xs...>& b)
{

View File

@@ -264,93 +264,6 @@ bool operator>(const half_t& x, const half_t& y) { return __hgt(x.to_fp16(), y.t
CK_TILE_DEVICE
bool operator>=(const half_t& x, const half_t& y) { return __hge(x.to_fp16(), y.to_fp16()); }
#if 0
CK_TILE_DEVICE
half_t operator+(const half_t& x, const half_t& y)
{
return half_t(__hadd(x.to_fp16(), y.to_fp16()));
}
CK_TILE_DEVICE
half_t operator-(const half_t& x) { return half_t(__hneg(x.to_fp16())); }
CK_TILE_DEVICE
half_t operator-(const half_t& x, const half_t& y)
{
return half_t(__hsub(x.to_fp16(), y.to_fp16()));
}
CK_TILE_DEVICE
half_t operator*(const half_t& x, const half_t& y)
{
return half_t(__hmul(x.to_fp16(), y.to_fp16()));
}
CK_TILE_DEVICE
half_t operator/(const half_t& x, const half_t& y)
{
return half_t(__hdiv(x.to_fp16(), y.to_fp16()));
}
CK_TILE_DEVICE
half_t& operator+=(half_t& x, const half_t& y)
{
x = half_t(__hadd(x.to_fp16(), y.to_fp16()));
return x;
}
CK_TILE_DEVICE
half_t& operator-=(half_t& x, const half_t& y)
{
x = half_t(__hsub(x.to_fp16(), y.to_fp16()));
return x;
}
CK_TILE_DEVICE
half_t& operator*=(half_t& x, const half_t& y)
{
x = half_t(__hmul(x.to_fp16(), y.to_fp16()));
return x;
}
CK_TILE_DEVICE
half_t& operator/=(half_t& x, const half_t& y)
{
x = half_t(__hdiv(x.to_fp16(), y.to_fp16()));
return x;
}
CK_TILE_DEVICE
half_t& operator++(half_t& x)
{
x = half_t(__hadd(x.to_fp16(), half_t(1.0f).to_fp16()));
return x;
}
CK_TILE_DEVICE
half_t& operator--(half_t& x)
{
x = half_t(__hsub(x.to_fp16(), half_t(1.0f).to_fp16()));
return x;
}
CK_TILE_DEVICE
half_t operator++(half_t& x, int)
{
half_t y(x);
x = half_t(__hadd(x.to_fp16(), half_t(1.0f).to_fp16()));
return y;
}
CK_TILE_DEVICE
half_t operator--(half_t& x, int)
{
half_t y(x);
x = half_t(__hsub(x.to_fp16(), half_t(1.0f).to_fp16()));
return y;
}
#endif
#if CK_TILE_USE_CUSTOM_DATA_TYPE
CK_TILE_ARITHMETIC_USING_FLOAT(CK_TILE_HOST, half_t)
#endif

View File

@@ -73,27 +73,6 @@ struct numeric<int8_t>
CK_TILE_HOST_DEVICE static constexpr int8_t zero() { return 0; }
};
#if 0
template <>
struct numeric_traits<int8_t>
{
static constexpr int exp = 5;
static constexpr int mant = 10;
static constexpr int bias = 15;
static constexpr uint16_t nan_mask = 0x7C00;
static constexpr uint16_t head_mask = 0xFC00;
static constexpr uint16_t mant_mask = 0x3FF;
static constexpr uint16_t exp_mask = 0x1F;
static constexpr uint32_t Inf = 0x7C00;
static constexpr uint32_t NegInf = 0xFC00;
static constexpr uint32_t NaN = 0x7C01;
static constexpr uint32_t Neg0 = 0x8000;
static constexpr int PackedSize = 1;
using bitwise_type = uint16_t;
};
#endif
CK_TILE_HOST_DEVICE
constexpr float int8_to_float(const int8_t& x) { return static_cast<float>(x); }

View File

@@ -295,10 +295,6 @@ struct tile_sweeper
F f;
};
// partial deduction is not allowed
// template <typename T, typename F, typename U>
// tile_sweeper(const F&, U = {})->tile_sweeper<T, F, U>;
// deduction guide
template <typename T,
typename F,

View File

@@ -454,45 +454,6 @@ struct tile_distribution_detail
} // namespace detail
#if 0
// this returns a constexpr tile_distribution
template <typename StaticTileDistributionEncoding_>
CK_TILE_HOST_DEVICE constexpr auto make_tile_distribution(StaticTileDistributionEncoding_)
{
using DstrEncode = remove_cvref_t<StaticTileDistributionEncoding_>;
constexpr auto adaptor_impl =
detail::make_adaptor_encoding_for_tile_distribution(StaticTileDistributionEncoding_{});
constexpr auto ps_ys_to_xs_adaptor_impl = adaptor_impl.template at<0>();
constexpr auto ys_to_d_adaptor_impl = adaptor_impl.template at<1>();
constexpr index_t d_length = adaptor_impl.template at<2>();
constexpr auto rh_major_minor_to_hidden_ids_impl = adaptor_impl.template at<3>();
constexpr auto ps_ys_to_xs_adaptor =
CONSTRUCT_TENSOR_ADAPTOR_FROM_ENCODING(ps_ys_to_xs_adaptor_impl);
constexpr auto ys_to_d_adaptor = CONSTRUCT_TENSOR_ADAPTOR_FROM_ENCODING(ys_to_d_adaptor_impl);
constexpr auto ys_to_d_descriptor =
make_tensor_descriptor_from_adaptor(ys_to_d_adaptor, d_length);
//
constexpr index_t ndim_rh_major = DstrEncode::detail::ndim_rh_major_;
constexpr auto ndims_rhs_minor = DstrEncode::detail::ndims_rhs_minor_;
constexpr auto rh_major_minor_to_hidden_ids =
TO_TUPLE_OF_SEQUENCE(rh_major_minor_to_hidden_ids_impl, ndim_rh_major, ndims_rhs_minor);
return tile_distribution<
remove_cvref_t<decltype(ps_ys_to_xs_adaptor)>,
remove_cvref_t<decltype(ys_to_d_descriptor)>,
remove_cvref_t<DstrEncode>,
detail::tile_distribution_detail<remove_cvref_t<decltype(rh_major_minor_to_hidden_ids)>>>{
ps_ys_to_xs_adaptor, ys_to_d_descriptor};
}
#endif
// this returns a static tile_distribution
template <typename StaticTileDistributionEncoding_>
CK_TILE_HOST_DEVICE constexpr auto make_static_tile_distribution(StaticTileDistributionEncoding_)

View File

@@ -745,14 +745,6 @@ struct PassThroughPack2
template <typename Y, typename X>
CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const;
#if 0
CK_TILE_HOST_DEVICE constexpr void operator()(ck_tile::fp16x2_t& y, const ck_tile::f8x2_t& x) const
{
auto t = type_convert<float2_t>(x);
y = type_convert<fp16x2_t>(t);
}
#endif
CK_TILE_HOST_DEVICE constexpr void operator()(fp16x2_t& y, const pk_int4_t& x) const
{
uint8_t x_u8 = bit_cast<uint8_t>(x);
@@ -871,61 +863,6 @@ struct UnaryConvert
}
};
#if 0
struct ConvertBF16RTN
{
// convert to bf16 using round to nearest (rtn)
template <typename Y, typename X>
CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const
{
// check Y datatype
static_assert(std::is_same_v<Y, ck_tile::bf16_t>, "Data type is not supported by this operation!");
// check X datatype
static_assert(std::is_same_v<X, float> || std::is_same_v<X, ck_tile::fp16_t>,
"Data type is not supported by this operation!");
y = bf16_convert_rtn<Y>(x);
}
};
struct ConvertF8SR
{
// convert to fp8 using stochastic rounding (SR)
template <typename Y, typename X>
CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const
{
// check Y datatype
static_assert(std::is_same_v<Y, ck_tile::fp8_t> || std::is_same_v<Y, ck_tile::bf8_t>,
"Data type is not supported by this operation!");
// check X datatype
static_assert(std::is_same_v<X, float> || std::is_same_v<X, ck_tile::fp16_t>,
"Data type is not supported by this operation!");
y = f8_convert_sr<Y>(x);
}
};
struct ConvertF8RNE
{
// convert to fp8 using rounding to nearest even
template <typename Y, typename X>
CK_TILE_HOST_DEVICE void operator()(Y& y, const X& x) const
{
// check Y datatype
static_assert(std::is_same_v<Y, ck_tile::fp8_t> || std::is_same_v<Y, ck_tile::bf8_t>,
"Data type is not supported by this operation!");
// check X datatype
static_assert(std::is_same_v<X, float> || std::is_same_v<X, ck_tile::fp16_t>,
"Data type is not supported by this operation!");
y = f8_convert_rne<Y>(x);
}
};
#endif
struct Scale
{
static constexpr const char* name = "Scale";

View File

@@ -339,16 +339,6 @@ struct GroupedFlatmmKernel : FlatmmKernel<TilePartitioner_, FlatmmPipeline_, Epi
{
return hostArgs;
}
// CK_TILE_HOST static constexpr auto
// MakeKernelArgs(const ContiguousGroupedFlatmmHostArgs& hostArgs)
// {
// return hostArgs;
// }
// CK_TILE_HOST static constexpr auto
// MakeKernelArgs(const MaskedGroupedFlatmmHostArgs& hostArgs)
// {
// return hostArgs;
// }
template <class ScaleM = FlatmmScalePointer<-1>,
class ScaleN = FlatmmScalePointer<-1>,

View File

@@ -483,13 +483,6 @@ struct MoeFlatmmKernel
if constexpr(std::is_same_v<BLayout, tensor_layout::gemm::RowMajor>)
{
// if(kargs.N % TilePartitioner::NPerBlock != 0 && FlatmmPipeline::kPadN == false)
// {
// std::cerr << "Can't support N that is not a multiple of NPerBlock"
// " without padding!"
// << std::endl;
// return false;
// }
if(kargs.N % FlatmmPipeline::GetVectorSizeB() != 0)
{
std::cerr << "N is not a multiple of vector load size for B tensor!" << std::endl;

View File

@@ -392,10 +392,6 @@ struct UniversalFlatmmPipelineAgBgCrPolicy
constexpr index_t M1 = BlockSize / get_warp_size();
static_assert(M2 != 0, "M2 is zero, which will lead to a division by zero error.");
static_assert(M1 != 0, "M1 is zero, which will lead to a division by zero error.");
// constexpr index_t M0 = MPerBlock / (M2 * M1);
// static_assert(M0 * M1 * M2 == MPerBlock,
// "Incorrect M0, M2, M1 configuration! "
// "M0, M1, M2 must cover whole MPerBlock!");
return make_static_tile_distribution(
tile_distribution_encoding<sequence<1>,

View File

@@ -1151,11 +1151,6 @@ struct F16xMXF4FlatmmPipelineAGmemBGmemCRegV1
a_warp_tensor(number<AwarpIter>{}) =
load_tile(a_warp_windows_pong(number<AmIter>{})(number<AkIter>{}));
}
// barrier
// if constexpr((kIter == KIterPerWarp - 1) && (mIter == MIter_2nd_last))
// {
// block_sync_lds();
// }
});
}
});
@@ -1636,10 +1631,6 @@ struct F8xMXF4FlatmmPipelineAGmemBGmemCRegV1
? Aload_rep
: 0;
}
// if((kIter % KPerScaleLoad == 0) && (mIter == 0))
// {
// load_perM = load_perM + 1;
// }
SchedulerPerM(dsread_perM, dswrite_perM, load_perM);
}
}

View File

@@ -103,13 +103,8 @@ struct MoeFlatmmPipelineAGmemBGmemCRegV1
static constexpr index_t Aload_num_perK = dswrite_num_perK;
static constexpr index_t Aload_rep = dswrite_rep;
static constexpr index_t Bload_num_perK = kNPerBlock * WG::kK / NWarp / BK1 / WaveSize;
// static constexpr index_t ScaleBload_K1 = ContinuousScaleNPerThread *
// ContinuousScaleKPerThread; static constexpr index_t ScaleBload_num =
// kNPerBlock * kKPerBlock / NWarp / 32 / ScaleBload_K1 /
// WaveSize; // BlockN * BlockK / NWarp / ScalePerK / ScaleB_K1 / wavesize
// static constexpr index_t KPerScaleLoad = KIterPerWarp / ScaleBload_num;
static constexpr index_t HalfMIter = (MIterPerWarp + 1) / 2;
static constexpr index_t Bload_rep = (Bload_num_perK + HalfMIter - 1) / HalfMIter;
static constexpr index_t HalfMIter = (MIterPerWarp + 1) / 2;
static constexpr index_t Bload_rep = (Bload_num_perK + HalfMIter - 1) / HalfMIter;
static constexpr index_t mfma_perM_perK = NIterPerWarp * mfma_per_wg;
static constexpr index_t dswrite_mIter = (DsWritePreIssue - 1) % MIterPerWarp;
@@ -352,10 +347,6 @@ struct MoeFlatmmPipelineAGmemBGmemCRegV1
? Aload_rep
: 0;
}
// if((kIter % KPerScaleLoad == 0) && (mIter == 0))
// {
// load_perM = load_perM + 1;
// }
SchedulerPerM(dsread_perM, dswrite_perM, load_perM);
}
}

View File

@@ -390,10 +390,6 @@ struct MXFlatmmPipelineAGmemBGmemCRegV1 : FlatmmPipelineAGmemBGmemCRegV1<Problem
? Aload_rep
: 0;
}
// if((kIter % KPerScaleLoad == 0) && (mIter == 0))
// {
// load_perM = load_perM + 1;
// }
SchedulerPerM(dsread_perM, dswrite_perM, load_perM);
}
}

View File

@@ -692,9 +692,6 @@ struct BlockFmhaPipelineQXKSVSCustomPolicy : BlockFmhaPipelineQXCustomPolicy<QLo
constexpr index_t LaneGroups = WarpSize / LanesPerK; // within a wave
constexpr index_t NumIssues = kNPerBlock / (LaneGroups * NumWarps);
static_assert(NumIssues == kNPerBlock * kKPerBlock / (kBlockSize * KVector));
// constexpr index_t SingleKSize = NumIssues * NumWarps * (WarpSize * KVector + kPad);
// constexpr index_t SingleVSize =
// MakeVLdsBlockDescriptor<Problem>().get_element_space_size();
constexpr index_t BufferSize =
GetSingleSmemElementSpaceSize<Problem>(); // max(SingleKSize, SingleVSize);

View File

@@ -456,9 +456,6 @@ struct MoeSortingKernel
template <typename T, typename F, index_t wave_size_ = get_warp_size()>
__device__ static constexpr T wave_reduce(T local, F reduce_f, number<wave_size_> = {})
{
// constexpr int wave_size = 64;
// constexpr int reduce_stage = 6; // 1<<6=64
// clang-format off
constexpr int reduce_stage = [](){
if constexpr(wave_size_ == 2) return 1;
else if constexpr(wave_size_ == 4) return 2;
@@ -1206,17 +1203,21 @@ CK_TILE_HOST_DEVICE index_t moe_sorting_mp_sem_smem_size()
template <typename T, typename F, index_t wave_size_ = get_warp_size()>
CK_TILE_DEVICE constexpr T moe_sorting_wave_reduce(T local, F reduce_f, number<wave_size_> = {})
{
// constexpr int wave_size = 64;
// constexpr int reduce_stage = 6; // 1<<6=64
// clang-format off
constexpr int reduce_stage = [](){
if constexpr(wave_size_ == 2) return 1;
else if constexpr(wave_size_ == 4) return 2;
else if constexpr(wave_size_ == 8) return 3;
else if constexpr(wave_size_ == 16) return 4;
else if constexpr(wave_size_ == 32) return 5;
else if constexpr(wave_size_ == 64) return 6;
else return 0;
constexpr int reduce_stage = []() {
if constexpr(wave_size_ == 2)
return 1;
else if constexpr(wave_size_ == 4)
return 2;
else if constexpr(wave_size_ == 8)
return 3;
else if constexpr(wave_size_ == 16)
return 4;
else if constexpr(wave_size_ == 32)
return 5;
else if constexpr(wave_size_ == 64)
return 6;
else
return 0;
}();
// clang-format on
T v_local = local;
@@ -3047,53 +3048,6 @@ struct MoeSortingMultiPhaseKernel_P23
x_r = x_v;
#endif
{
#if 0
#pragma unroll
for(int j = 0; j < index_pack / 2; j++)
{
int i_token = i * kBlockSize * index_pack + threadIdx.x + j * kBlockSize;
index_t x = x_d[j];
int i_topk = x - 1; // topk of this token
int i_show = x != 0 ? 1 : 0; // has this token or not
int cumsum = i_show;
impl::moe_sorting_wave_cumsum<int, get_warp_size()>(cumsum);
__syncthreads();
if(lane_id == get_warp_size() - 1)
{
s[4 + wave_id] = cumsum;
}
__syncthreads();
// reduce cross wave
static_for<0, kBlockSize / get_warp_size() - 1, 1>{}([&](auto i_w) {
IndexType prev = s[4 + i_w];
prev = wave_id > i_w ? prev : 0; // mask out
cumsum += prev;
});
cumsum += prev_cumsum; // add previous round cumsum
if(threadIdx.x == kBlockSize - 1)
{
s[0] = cumsum;
}
__syncthreads();
int position = cumsum - i_show;
prev_cumsum = s[0]; // update the last cumsum
if(i_show)
{
#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
p_sorted_token_ids[e_start + position] =
MOE_SORTING_MOCK_ID(i_token, i_topk);
#else
p_sorted_token_ids[e_start + position] = i_token;
#endif
p_sorted_weights[e_start + position] =
p_weights[i_token * kargs.topk_mdiv.divisor + i_topk];
}
}
#endif
{
d_t i_topk;
d_t i_show;
@@ -3151,68 +3105,6 @@ struct MoeSortingMultiPhaseKernel_P23
}
position += i_show[j];
});
#if 0
int i_token = i * kBlockSize * index_pack + threadIdx.x * 2 + j * kBlockSize * 2;
index_t x = x_d[j];
index_t x0 = static_cast<index_t>(x & 0xffff);
index_t x1 = static_cast<index_t>(x >> 16);
int i_topk_0 = x0 - 1; // topk of this token
int i_show_0 = x0 != 0 ? 1 : 0; // has this token or not
int i_topk_1 = x1 - 1; // topk of this token
int i_show_1 = x1 != 0 ? 1 : 0; // has this token or not
int cumsum = i_show_0 + i_show_1;
impl::moe_sorting_wave_cumsum<int, get_warp_size()>(cumsum);
__syncthreads();
if(lane_id == get_warp_size() - 1)
{
s[4 + wave_id] = cumsum;
}
__syncthreads();
// reduce cross wave
static_for<0, kBlockSize / get_warp_size() - 1, 1>{}([&](auto i_w) {
IndexType prev = s[4 + i_w];
prev = wave_id > i_w ? prev : 0; // mask out
cumsum += prev;
});
cumsum += prev_cumsum; // add previous round cumsum
if(threadIdx.x == kBlockSize - 1)
{
s[0] = cumsum;
}
__syncthreads();
int position_0 = cumsum - i_show_0 - i_show_1;
prev_cumsum = s[0]; // update the last cumsum
if(i_show_0)
{
#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
p_sorted_token_ids[e_start + position_0] =
MOE_SORTING_MOCK_ID(i_token, i_topk_0);
#else
p_sorted_token_ids[e_start + position_0] = i_token;
#endif
p_sorted_weights[e_start + position_0] =
p_weights[i_token * kargs.topk_mdiv.divisor + i_topk_0];
}
int position_1 = cumsum - i_show_1;
if(i_show_1)
{
#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
p_sorted_token_ids[e_start + position_1] =
MOE_SORTING_MOCK_ID(i_token + 1, i_topk_1);
#else
p_sorted_token_ids[e_start + position_1] = i_token + 1;
#endif
p_sorted_weights[e_start + position_1] =
p_weights[(i_token + 1) * kargs.topk_mdiv.divisor + i_topk_1];
}
#endif
}
}
}

View File

@@ -14,14 +14,6 @@
namespace ck_tile {
// template <typename Problem_, typename Policy_ = MoeSortingPolicy>
// struct MoeSortingPipeline
// {
// // TODO: this kernel only support warp per row
// using Problem = remove_cvref_t<Problem_>;
// using Policy = remove_cvref_t<Policy_>;
// using WeightType = typename Problem::WeightType;
// template <typename TopkIdWindow, typename WeightWindow>
// CK_TILE_DEVICE auto operator()(const TopkIdWindow& topk_id_window,
// const WeightWindow& weight_window,

View File

@@ -36,9 +36,6 @@ struct BlockGemmARegBSmemCRegOneWarpV1
std::is_same_v<CDataType, remove_cv_t<typename CBlockTensor::DataType>>,
"wrong!");
// constexpr index_t MPerBlock = ABlockTensorTmp{}.get_lengths()[number<0>{}];
// constexpr index_t NPerBlock = BBlockWindowTmp{}.get_window_lengths()[number<0>{}];
// constexpr index_t KPerBlock = ABlockTensorTmp{}.get_lengths()[number<1>{}];
constexpr index_t MPerBlock = BlockGemmShape::kM;
constexpr index_t NPerBlock = BlockGemmShape::kN;
constexpr index_t KPerBlock = BlockGemmShape::kK;

View File

@@ -19,30 +19,7 @@ struct BlockGemmARegBSmemCRegV1DefaultPolicy
std::is_same_v<typename Problem::BDataType, half_t> &&
std::is_same_v<typename Problem::CDataType, float>)
{
#if 0
constexpr index_t kBlockSize = Problem::kBlockSize;
constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
constexpr index_t kNPerBlock = Problem::BlockGemmShape::kN;
constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
static_assert(kBlockSize % get_warp_size() == 0, "wrong!");
constexpr index_t NumWarp = kBlockSize / get_warp_size();
// FIXME
if constexpr(NumWarp == 4 && kMPerBlock % 128 == 0 &&
kNPerBlock % 128 == 0 % kKPerBlock % 16 == 0)
{
return make_tuple(WarpGemmMfmaF16F16F32M32N32K8{}, 4, 1);
}
else
{
return make_tuple(WarpGemmMfmaF16F16F32M32N32K8{}, 4, 1);
}
#else
return make_tuple(WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution{}, 4, 1);
#endif
}
else if constexpr(std::is_same_v<typename Problem::ADataType, bf16_t> &&
std::is_same_v<typename Problem::BDataType, bf16_t> &&

View File

@@ -16,30 +16,7 @@ struct BlockGemmARegBSmemCRegV2DefaultPolicy
CK_TILE_HOST_DEVICE static constexpr auto GetWarpGemmMWarpNWarp()
{
#if 0
constexpr index_t kBlockSize = Problem::kBlockSize;
constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
constexpr index_t kNPerBlock = Problem::BlockGemmShape::kN;
constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
static_assert(kBlockSize % get_warp_size() == 0, "wrong!");
constexpr index_t NumWarp = kBlockSize / get_warp_size();
// FIXME
if constexpr(NumWarp == 4 && kMPerBlock % 128 == 0 &&
kNPerBlock % 128 == 0 % kKPerBlock % 16 == 0)
{
return make_tuple(WarpGemmMfmaF16F16F32M32N32K8{}, 4, 1);
}
else
{
return make_tuple(WarpGemmMfmaF16F16F32M32N32K8{}, 4, 1);
}
#else
return make_tuple(WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution{}, 4, 1);
#endif
}
};

View File

@@ -19,30 +19,7 @@ struct BlockGemmASmemBRegCRegV1DefaultPolicy
std::is_same_v<typename Problem::BDataType, half_t> &&
std::is_same_v<typename Problem::CDataType, float>)
{
#if 0
constexpr index_t kBlockSize = Problem::kBlockSize;
constexpr index_t kMPerBlock = Problem::BlockGemmShape::kM;
constexpr index_t kNPerBlock = Problem::BlockGemmShape::kN;
constexpr index_t kKPerBlock = Problem::BlockGemmShape::kK;
static_assert(kBlockSize % get_warp_size() == 0, "wrong!");
constexpr index_t NumWarp = kBlockSize / get_warp_size();
// FIXME
if constexpr(NumWarp == 4 && kMPerBlock % 128 == 0 &&
kNPerBlock % 128 == 0 % kKPerBlock % 16 == 0)
{
return make_tuple(WarpGemmMfmaF16F16F32M32N32K8{}, 4, 1);
}
else
{
return make_tuple(WarpGemmMfmaF16F16F32M32N32K8{}, 4, 1);
}
#else
return make_tuple(WarpGemmMfmaF16F16F32M32N32K8TransposedCDistribution{}, 4, 1);
#endif
}
else if constexpr(std::is_same_v<typename Problem::ADataType, bf16_t> &&
std::is_same_v<typename Problem::BDataType, bf16_t> &&

View File

@@ -120,10 +120,6 @@ struct BlockNormReduceSync
constexpr index_t idim_p_lane = NDimP - 1;
// const auto ps_idx = make_array<index_t>(get_warp_id(), get_lane_id());
// const auto rs_idx =
// mean_tensor.get_tile_distribution().calculate_rs_index_from_ps_index(ps_idx);
constexpr index_t thread_buf_size = MeanDistributedTensor_::get_thread_buffer_size();
static_assert(thread_buf_size == VarDistributedTensor_::get_thread_buffer_size());
@@ -360,17 +356,6 @@ struct BlockNormReduceCrossWarpSync
template <typename BlockShape>
CK_TILE_DEVICE constexpr index_t block_tile_welford_calculate_max_count(int row_size)
{
#if 0
using S = BlockShape;
index_t LastloopN = row_size % S::Block_N == 0 ? S::Block_N : row_size % S::Block_N;
constexpr index_t NThread = S::WarpPerBlock_N * S::ThreadPerWarp_N;
index_t iNLane = get_thread_id() % NThread;
index_t iN0 = LastloopN / (S::Vector_N * S::ThreadPerWarp_N);
index_t iN1 = (LastloopN % (S::Vector_N * S::ThreadPerWarp_N)) / S::Vector_N;
index_t N2 = (LastloopN % (S::Vector_N * S::ThreadPerWarp_N)) % S::Vector_N;
index_t iN3 = iNLane < iN1 ? S::Vector_N : iNLane == iN1 ? N2 : 0;
return iN0 * S::Vector_N + iN3;
#endif
using S_ = BlockShape;
constexpr index_t ThreadsPerBlock_N = S_::WarpPerBlock_N * S_::ThreadPerWarp_N;

View File

@@ -140,28 +140,6 @@ struct BlockReduce2d
ReducePacksPerXDim{});
}
#if 0
constexpr auto I0 = number<0>{};
constexpr auto I1 = number<1>{};
constexpr auto spans = XDistributedTensor_::get_distributed_spans();
// FIXME: hard coded to reduce 2nd axis
sweep_tile_span(spans[I0], [&](auto dstr_idx_i0) {
constexpr auto y_dstr_idx = make_tuple(dstr_idx_i0);
auto y = y_tensor[y_dstr_idx];
sweep_tile_span(spans[I1], [&](auto dstr_idx_i1) {
constexpr auto in_dstr_idx = make_tuple(dstr_idx_i0, dstr_idx_i1);
const auto x = ck_tile::type_convert<ComputeDataType>(x_tensor[in_dstr_idx]);
y = reduce_func(y, x);
});
y_tensor(y_dstr_idx) = y;
});
#endif
template <typename XDistributedTensor_>
CK_TILE_DEVICE static auto MakeYBlockTile()
{
@@ -240,10 +218,6 @@ struct BlockReduce2dSync
constexpr index_t idim_p_lane = NDimP - 1;
// const auto ps_idx = make_array<index_t>(get_warp_id(), get_lane_id());
// const auto rs_idx =
// y_tensor.get_tile_distribution().calculate_rs_index_from_ps_index(ps_idx);
constexpr index_t thread_buf_size = YDistributedTensor_::get_thread_buffer_size();
// loop over thread data