[rocm-libraries] ROCm/rocm-libraries#6302 (commit 8d419e8)

CK: Remove 41 commented-out dead code blocks (~200 lines) (#6302) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Depends on #6300 ## Summary Remove 41 commented-out code blocks across 33 files in Composable Kernel, totaling ~200 lines. Identified using an automated dead code scanning skill (`ck-dead-code`) with a calibrated two-stage pipeline: 1. **Pre-filter**: Keyword-based scan found 1,338 `//`-commented blocks. Calibrated heuristics (trained on 50-sample expert classification) reduced to 89 high-confidence candidates — 93% noise reduction. 2. **Expert triage**: LLM expert classified each block in context as CODE_REMOVE, CODE_KEEP, or NOT_CODE. | Classification | Count | |---------------|-------| | Removed (this PR) | 41 | | Kept (debug helpers, alt configs, reference impls) | 32 | | Not code (false positives) | 16 | Removed blocks include: superseded implementations, old test data, abandoned stubs, unreachable code, and buggy dead code.
2026-05-04 13:41:24 +00:00 · 2026-04-10 15:18:02 +00:00
parent 4d0bbe5d17
commit e0dfe58d66
82 changed files with 22 additions and 2883 deletions
--- a/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
+++ b/include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
@@ -456,9 +456,6 @@ struct MoeSortingKernel
    template <typename T, typename F, index_t wave_size_ = get_warp_size()>
    __device__ static constexpr T wave_reduce(T local, F reduce_f, number<wave_size_> = {})
    {
-        // constexpr int wave_size = 64;
-        // constexpr int reduce_stage = 6; // 1<<6=64
-        // clang-format off
        constexpr int reduce_stage = [](){
            if constexpr(wave_size_ == 2) return 1;
            else if constexpr(wave_size_ == 4) return 2;
@@ -1206,17 +1203,21 @@ CK_TILE_HOST_DEVICE index_t moe_sorting_mp_sem_smem_size()
 template <typename T, typename F, index_t wave_size_ = get_warp_size()>
 CK_TILE_DEVICE constexpr T moe_sorting_wave_reduce(T local, F reduce_f, number<wave_size_> = {})
 {
-    // constexpr int wave_size = 64;
-    // constexpr int reduce_stage = 6; // 1<<6=64
-    // clang-format off
-    constexpr int reduce_stage = [](){
-        if constexpr(wave_size_ == 2) return 1;
-        else if constexpr(wave_size_ == 4) return 2;
-        else if constexpr(wave_size_ == 8) return 3;
-        else if constexpr(wave_size_ == 16) return 4;
-        else if constexpr(wave_size_ == 32) return 5;
-        else if constexpr(wave_size_ == 64) return 6;
-        else return 0;
+    constexpr int reduce_stage = []() {
+        if constexpr(wave_size_ == 2)
+            return 1;
+        else if constexpr(wave_size_ == 4)
+            return 2;
+        else if constexpr(wave_size_ == 8)
+            return 3;
+        else if constexpr(wave_size_ == 16)
+            return 4;
+        else if constexpr(wave_size_ == 32)
+            return 5;
+        else if constexpr(wave_size_ == 64)
+            return 6;
+        else
+            return 0;
    }();
    // clang-format on
    T v_local = local;
@@ -3047,53 +3048,6 @@ struct MoeSortingMultiPhaseKernel_P23
                x_r = x_v;
 #endif
                {
-#if 0
-#pragma unroll
-                    for(int j = 0; j < index_pack / 2; j++)
-                    {
-                        int i_token = i * kBlockSize * index_pack + threadIdx.x + j * kBlockSize;
-                        index_t x   = x_d[j];
-                        int i_topk  = x - 1;          // topk of this token
-                        int i_show  = x != 0 ? 1 : 0; // has this token or not
-                        int cumsum  = i_show;
-                        impl::moe_sorting_wave_cumsum<int, get_warp_size()>(cumsum);
-
-                        __syncthreads();
-                        if(lane_id == get_warp_size() - 1)
-                        {
-                            s[4 + wave_id] = cumsum;
-                        }
-                        __syncthreads();
-
-                        // reduce cross wave
-                        static_for<0, kBlockSize / get_warp_size() - 1, 1>{}([&](auto i_w) {
-                            IndexType prev = s[4 + i_w];
-                            prev           = wave_id > i_w ? prev : 0; // mask out
-                            cumsum += prev;
-                        });
-                        cumsum += prev_cumsum; // add previous round cumsum
-                        if(threadIdx.x == kBlockSize - 1)
-                        {
-                            s[0] = cumsum;
-                        }
-                        __syncthreads();
-
-                        int position = cumsum - i_show;
-                        prev_cumsum  = s[0]; // update the last cumsum
-
-                        if(i_show)
-                        {
-#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
-                            p_sorted_token_ids[e_start + position] =
-                                MOE_SORTING_MOCK_ID(i_token, i_topk);
-#else
-                            p_sorted_token_ids[e_start + position] = i_token;
-#endif
-                            p_sorted_weights[e_start + position] =
-                                p_weights[i_token * kargs.topk_mdiv.divisor + i_topk];
-                        }
-                    }
-#endif
                    {
                        d_t i_topk;
                        d_t i_show;
@@ -3151,68 +3105,6 @@ struct MoeSortingMultiPhaseKernel_P23
                            }
                            position += i_show[j];
                        });
-
-#if 0
-                        int i_token = i * kBlockSize * index_pack + threadIdx.x * 2 + j * kBlockSize * 2;
-                        index_t x   = x_d[j];
-                        index_t x0  = static_cast<index_t>(x & 0xffff);
-                        index_t x1  = static_cast<index_t>(x >> 16);
-                        int i_topk_0  = x0 - 1;          // topk of this token
-                        int i_show_0  = x0 != 0 ? 1 : 0; // has this token or not
-                        int i_topk_1  = x1 - 1;          // topk of this token
-                        int i_show_1  = x1 != 0 ? 1 : 0; // has this token or not
-                        int cumsum  = i_show_0 + i_show_1;
-                        impl::moe_sorting_wave_cumsum<int, get_warp_size()>(cumsum);
-
-                        __syncthreads();
-                        if(lane_id == get_warp_size() - 1)
-                        {
-                            s[4 + wave_id] = cumsum;
-                        }
-                        __syncthreads();
-
-                        // reduce cross wave
-                        static_for<0, kBlockSize / get_warp_size() - 1, 1>{}([&](auto i_w) {
-                            IndexType prev = s[4 + i_w];
-                            prev           = wave_id > i_w ? prev : 0; // mask out
-                            cumsum += prev;
-                        });
-                        cumsum += prev_cumsum; // add previous round cumsum
-                        if(threadIdx.x == kBlockSize - 1)
-                        {
-                            s[0] = cumsum;
-                        }
-                        __syncthreads();
-
-                        int position_0 = cumsum - i_show_0 - i_show_1;
-                        prev_cumsum  = s[0]; // update the last cumsum
-
-                        if(i_show_0)
-                        {
-#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
-                            p_sorted_token_ids[e_start + position_0] =
-                                MOE_SORTING_MOCK_ID(i_token, i_topk_0);
-#else
-                            p_sorted_token_ids[e_start + position_0] = i_token;
-#endif
-                            p_sorted_weights[e_start + position_0] =
-                                p_weights[i_token * kargs.topk_mdiv.divisor + i_topk_0];
-                        }
-
-                        int position_1 = cumsum - i_show_1;
-
-                        if(i_show_1)
-                        {
-#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
-                            p_sorted_token_ids[e_start + position_1] =
-                                MOE_SORTING_MOCK_ID(i_token + 1, i_topk_1);
-#else
-                            p_sorted_token_ids[e_start + position_1] = i_token + 1;
-#endif
-                            p_sorted_weights[e_start + position_1] =
-                                p_weights[(i_token + 1) * kargs.topk_mdiv.divisor + i_topk_1];
-                        }
-#endif
                    }
                }
            }
--- a/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp
+++ b/include/ck_tile/ops/fused_moe/pipeline/moe_sorting_pipeline.hpp
@@ -14,14 +14,6 @@

 namespace ck_tile {

-// template <typename Problem_, typename Policy_ = MoeSortingPolicy>
-// struct MoeSortingPipeline
-// {
-//     // TODO: this kernel only support warp per row
-//     using Problem    = remove_cvref_t<Problem_>;
-//     using Policy     = remove_cvref_t<Policy_>;
-//     using WeightType = typename Problem::WeightType;
-
 //     template <typename TopkIdWindow, typename WeightWindow>
 //     CK_TILE_DEVICE auto operator()(const TopkIdWindow& topk_id_window,
 //                                    const WeightWindow& weight_window,