[rocm-libraries] ROCm/rocm-libraries#6302 (commit 8d419e8)

CK: Remove 41 commented-out dead code blocks (~200 lines)
 (#6302)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Depends on #6300

## Summary

Remove 41 commented-out code blocks across 33 files in Composable
Kernel, totaling ~200 lines.

Identified using an automated dead code scanning skill (`ck-dead-code`)
with a calibrated two-stage pipeline:
1. **Pre-filter**: Keyword-based scan found 1,338 `//`-commented blocks.
Calibrated heuristics (trained on 50-sample expert classification)
reduced to 89 high-confidence candidates — 93% noise reduction.
2. **Expert triage**: LLM expert classified each block in context as
CODE_REMOVE, CODE_KEEP, or NOT_CODE.

| Classification | Count |
|---------------|-------|
| Removed (this PR) | 41 |
| Kept (debug helpers, alt configs, reference impls) | 32 |
| Not code (false positives) | 16 |

Removed blocks include: superseded implementations, old test data,
abandoned stubs, unreachable code, and buggy dead code.
This commit is contained in:
Aviral Goel
2026-04-10 15:18:02 +00:00
committed by assistant-librarian[bot]
parent 4d0bbe5d17
commit e0dfe58d66
82 changed files with 22 additions and 2883 deletions

View File

@@ -456,9 +456,6 @@ struct MoeSortingKernel
template <typename T, typename F, index_t wave_size_ = get_warp_size()>
__device__ static constexpr T wave_reduce(T local, F reduce_f, number<wave_size_> = {})
{
// constexpr int wave_size = 64;
// constexpr int reduce_stage = 6; // 1<<6=64
// clang-format off
constexpr int reduce_stage = [](){
if constexpr(wave_size_ == 2) return 1;
else if constexpr(wave_size_ == 4) return 2;
@@ -1206,17 +1203,21 @@ CK_TILE_HOST_DEVICE index_t moe_sorting_mp_sem_smem_size()
template <typename T, typename F, index_t wave_size_ = get_warp_size()>
CK_TILE_DEVICE constexpr T moe_sorting_wave_reduce(T local, F reduce_f, number<wave_size_> = {})
{
// constexpr int wave_size = 64;
// constexpr int reduce_stage = 6; // 1<<6=64
// clang-format off
constexpr int reduce_stage = [](){
if constexpr(wave_size_ == 2) return 1;
else if constexpr(wave_size_ == 4) return 2;
else if constexpr(wave_size_ == 8) return 3;
else if constexpr(wave_size_ == 16) return 4;
else if constexpr(wave_size_ == 32) return 5;
else if constexpr(wave_size_ == 64) return 6;
else return 0;
constexpr int reduce_stage = []() {
if constexpr(wave_size_ == 2)
return 1;
else if constexpr(wave_size_ == 4)
return 2;
else if constexpr(wave_size_ == 8)
return 3;
else if constexpr(wave_size_ == 16)
return 4;
else if constexpr(wave_size_ == 32)
return 5;
else if constexpr(wave_size_ == 64)
return 6;
else
return 0;
}();
// clang-format on
T v_local = local;
@@ -3047,53 +3048,6 @@ struct MoeSortingMultiPhaseKernel_P23
x_r = x_v;
#endif
{
#if 0
#pragma unroll
for(int j = 0; j < index_pack / 2; j++)
{
int i_token = i * kBlockSize * index_pack + threadIdx.x + j * kBlockSize;
index_t x = x_d[j];
int i_topk = x - 1; // topk of this token
int i_show = x != 0 ? 1 : 0; // has this token or not
int cumsum = i_show;
impl::moe_sorting_wave_cumsum<int, get_warp_size()>(cumsum);
__syncthreads();
if(lane_id == get_warp_size() - 1)
{
s[4 + wave_id] = cumsum;
}
__syncthreads();
// reduce cross wave
static_for<0, kBlockSize / get_warp_size() - 1, 1>{}([&](auto i_w) {
IndexType prev = s[4 + i_w];
prev = wave_id > i_w ? prev : 0; // mask out
cumsum += prev;
});
cumsum += prev_cumsum; // add previous round cumsum
if(threadIdx.x == kBlockSize - 1)
{
s[0] = cumsum;
}
__syncthreads();
int position = cumsum - i_show;
prev_cumsum = s[0]; // update the last cumsum
if(i_show)
{
#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
p_sorted_token_ids[e_start + position] =
MOE_SORTING_MOCK_ID(i_token, i_topk);
#else
p_sorted_token_ids[e_start + position] = i_token;
#endif
p_sorted_weights[e_start + position] =
p_weights[i_token * kargs.topk_mdiv.divisor + i_topk];
}
}
#endif
{
d_t i_topk;
d_t i_show;
@@ -3151,68 +3105,6 @@ struct MoeSortingMultiPhaseKernel_P23
}
position += i_show[j];
});
#if 0
int i_token = i * kBlockSize * index_pack + threadIdx.x * 2 + j * kBlockSize * 2;
index_t x = x_d[j];
index_t x0 = static_cast<index_t>(x & 0xffff);
index_t x1 = static_cast<index_t>(x >> 16);
int i_topk_0 = x0 - 1; // topk of this token
int i_show_0 = x0 != 0 ? 1 : 0; // has this token or not
int i_topk_1 = x1 - 1; // topk of this token
int i_show_1 = x1 != 0 ? 1 : 0; // has this token or not
int cumsum = i_show_0 + i_show_1;
impl::moe_sorting_wave_cumsum<int, get_warp_size()>(cumsum);
__syncthreads();
if(lane_id == get_warp_size() - 1)
{
s[4 + wave_id] = cumsum;
}
__syncthreads();
// reduce cross wave
static_for<0, kBlockSize / get_warp_size() - 1, 1>{}([&](auto i_w) {
IndexType prev = s[4 + i_w];
prev = wave_id > i_w ? prev : 0; // mask out
cumsum += prev;
});
cumsum += prev_cumsum; // add previous round cumsum
if(threadIdx.x == kBlockSize - 1)
{
s[0] = cumsum;
}
__syncthreads();
int position_0 = cumsum - i_show_0 - i_show_1;
prev_cumsum = s[0]; // update the last cumsum
if(i_show_0)
{
#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
p_sorted_token_ids[e_start + position_0] =
MOE_SORTING_MOCK_ID(i_token, i_topk_0);
#else
p_sorted_token_ids[e_start + position_0] = i_token;
#endif
p_sorted_weights[e_start + position_0] =
p_weights[i_token * kargs.topk_mdiv.divisor + i_topk_0];
}
int position_1 = cumsum - i_show_1;
if(i_show_1)
{
#if CK_TILE_REFERENCE_MOE_SORTING_MOCK_ID
p_sorted_token_ids[e_start + position_1] =
MOE_SORTING_MOCK_ID(i_token + 1, i_topk_1);
#else
p_sorted_token_ids[e_start + position_1] = i_token + 1;
#endif
p_sorted_weights[e_start + position_1] =
p_weights[(i_token + 1) * kargs.topk_mdiv.divisor + i_topk_1];
}
#endif
}
}
}

View File

@@ -14,14 +14,6 @@
namespace ck_tile {
// template <typename Problem_, typename Policy_ = MoeSortingPolicy>
// struct MoeSortingPipeline
// {
// // TODO: this kernel only support warp per row
// using Problem = remove_cvref_t<Problem_>;
// using Policy = remove_cvref_t<Policy_>;
// using WeightType = typename Problem::WeightType;
// template <typename TopkIdWindow, typename WeightWindow>
// CK_TILE_DEVICE auto operator()(const TopkIdWindow& topk_id_window,
// const WeightWindow& weight_window,