[CK_TILE] moe sorting ex kernel to support expert > 128 (#1840)

* moe sorting ex

* fix bug for race condition

* fix bug and optimze large expert

* fix

* optimize with sub_token_oneshot

* support skip empty tokens for expert sorting

* update moe_sorting

* tidy code
This commit is contained in:
carlushuang
2025-02-11 17:49:17 +08:00
committed by GitHub
parent 2312f4aaf9
commit c0adab4850
12 changed files with 936 additions and 99 deletions

View File

@@ -1,28 +0,0 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include <string>
#include <type_traits>
namespace ck_tile {
template <typename IndexType_,
typename WeightType_,
index_t InternalLoadUnroll_,
index_t ExpertTile_ = 0>
struct MoeSortingProblem
{
// TODO: this kernel only support warp per row
using WeightType = remove_cvref_t<WeightType_>;
using IndexType = remove_cvref_t<IndexType_>;
static constexpr index_t WarpSize = get_warp_size();
static constexpr index_t WarpsPerBlock = 1;
static constexpr index_t InternalLoadUnroll =
InternalLoadUnroll_; // TODO: need better design(like tile size)
static constexpr index_t ExpertTile = ExpertTile_; // TODO: only used in store out
};
} // namespace ck_tile