Use 64-bit integers (long / long_index_t) for the expert stride and B-matrix index to avoid 32-bit overflow

2026-05-04 05:31:24 +00:00 · 2025-08-21 06:58:55 +00:00
parent 9fbcc8f8a4
commit 85976b0b87
3 changed files with 19 additions and 18 deletions
--- a/include/ck_tile/host/reference/reference_moe_gemm.hpp
+++ b/include/ck_tile/host/reference/reference_moe_gemm.hpp
@@ -119,16 +119,16 @@ __global__ void moe_gemm_kernel(const ck_tile::index_t* p_sorted_token_ids_,
                              ? gather_token_id * strideA + k
                              : k * strideA + gather_token_id;

-            int b_index =
-                expert_id * N * K + ((std::is_same_v<LayoutB, tensor_layout::gemm::ColumnMajor>)
-                                         ? col * strideB + k
-                                         : k * strideB + col);
-            int b_index_up;
+            long b_index =
+                long(expert_id) * N * K +
+                ((std::is_same_v<LayoutB, tensor_layout::gemm::ColumnMajor>) ? col * strideB + k
+                                                                             : k * strideB + col);
+            long b_index_up;
            if constexpr(MoeGemmKind == 1)
-                b_index_up =
-                    expert_id * N * K + ((std::is_same_v<LayoutB, tensor_layout::gemm::ColumnMajor>)
-                                             ? (col + problem_N) * strideB + k
-                                             : k * strideB + col + problem_N);
+                b_index_up = long(expert_id) * N * K +
+                             ((std::is_same_v<LayoutB, tensor_layout::gemm::ColumnMajor>)
+                                  ? (col + problem_N) * strideB + k
+                                  : k * strideB + col + problem_N);

            AccDataType v_a;
            AccDataType v_b;
--- a/include/ck_tile/ops/moe_flatmm/kernel/moe_flatmm_kernel.hpp
+++ b/include/ck_tile/ops/moe_flatmm/kernel/moe_flatmm_kernel.hpp
@@ -644,7 +644,8 @@ struct MoeFlatmmKernel
        });

        const SplitKBatchOffset splitk_batch_offset(kargs);
-        const index_t expert_stride = __builtin_amdgcn_readfirstlane(kargs.N * kargs.K);
+        const long_index_t expert_stride =
+            __builtin_amdgcn_readfirstlane(long_index_t(kargs.N) * kargs.K);

        const ADataType* a_ptr =
            static_cast<const ADataType*>(kargs.a_ptr) + splitk_batch_offset.a_k_split_offset;