mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-04 05:31:24 +00:00
[CK_TILE] Add Various Fusion Functions to RMSNorm (#1802)
* Add shortcut to RMSNorm * Modify test for adding shortcut for RMSNorm * Add fused parameter into tests * 1. Add YDataType. 2. rmsnorm2d_fwd_traits_ from rmsnorm2d_fwd.hpp to rmsnorm2d_fwd_api.cpp and rmsnorm2d_fwd_instance_common.hpp * 1. Supports various stride and precisions. * Add support of Epilogue * Add fuse and epilogue support to rmsnorm ref * Modify rmsnorm example * Refactor tests/examples * Bug fix for newly added tests/examples * Bug fix for new tests 2 * Modify smoke test scripts remove dbg code * Supports non-smooth dynamic quant * Update Rmsnorm2dFwd::GetName() * rename xscale and prec_sx to smoothscale and prec_sm Bug fix after rename Remove files * change example_rmsnorm2d_fwd.cpp * update performance calculator * Fix issue in two-pass when fuse add is enabled * Remove comment of beta --------- Co-authored-by: rocking <ChunYu.Lai@amd.com>
This commit is contained in:
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
@@ -12,7 +12,7 @@ namespace ck_tile {
|
||||
struct MoeSmoothquantHostArgs
|
||||
{
|
||||
const void* p_x; // [tokens ,hidden_size], input, fp16/bf16
|
||||
const void* p_xscale; // [experts, hidden_size], input, columnwise scale, fp32
|
||||
const void* p_smscale; // [experts, hidden_size], input, columnwise scale, fp32
|
||||
const void* p_topk_ids; // [tokens, topk]
|
||||
|
||||
void* p_yscale; // [topk * tokens, 1], output, rowwise quant scale
|
||||
@@ -33,11 +33,11 @@ struct MoeSmoothquant
|
||||
using Pipeline = remove_cvref_t<Pipeline_>;
|
||||
using Problem = typename Pipeline::Problem;
|
||||
|
||||
using XDataType = remove_cvref_t<typename Problem::XDataType>;
|
||||
using XScaleDataType = remove_cvref_t<typename Problem::XScaleDataType>;
|
||||
using ComputeDataType = remove_cvref_t<typename Problem::ComputeDataType>;
|
||||
using YScaleDataType = remove_cvref_t<typename Problem::YScaleDataType>;
|
||||
using QYDataType = remove_cvref_t<typename Problem::QYDataType>;
|
||||
using XDataType = remove_cvref_t<typename Problem::XDataType>;
|
||||
using SmoothScaleDataType = remove_cvref_t<typename Problem::SmoothScaleDataType>;
|
||||
using ComputeDataType = remove_cvref_t<typename Problem::ComputeDataType>;
|
||||
using YScaleDataType = remove_cvref_t<typename Problem::YScaleDataType>;
|
||||
using QYDataType = remove_cvref_t<typename Problem::QYDataType>;
|
||||
|
||||
static constexpr index_t Block_M = Problem::BlockShape::Block_M;
|
||||
static constexpr index_t Block_N = Problem::BlockShape::Block_N;
|
||||
@@ -57,7 +57,7 @@ struct MoeSmoothquant
|
||||
struct Kargs
|
||||
{
|
||||
const void* p_x; // [tokens ,hidden_size], input, fp16/bf16
|
||||
const void* p_xscale; // [experts, hidden_size], input, columnwise scale, fp32
|
||||
const void* p_smscale; // [experts, hidden_size], input, columnwise scale, fp32
|
||||
const void* p_topk_ids; // [tokens, topk]
|
||||
|
||||
void* p_yscale; // [topk, tokens, 1], output, rowwise quant scale
|
||||
@@ -75,7 +75,7 @@ struct MoeSmoothquant
|
||||
CK_TILE_HOST static constexpr Kargs MakeKargs(const Hargs& hargs)
|
||||
{
|
||||
return Kargs{hargs.p_x,
|
||||
hargs.p_xscale,
|
||||
hargs.p_smscale,
|
||||
hargs.p_topk_ids,
|
||||
hargs.p_yscale,
|
||||
hargs.p_qy,
|
||||
@@ -153,9 +153,10 @@ struct MoeSmoothquant
|
||||
}();
|
||||
|
||||
// [experts, hidden_size],
|
||||
const auto xscale_window = [&]() {
|
||||
const auto smscale_window = [&]() {
|
||||
const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
|
||||
static_cast<const XScaleDataType*>(kargs.p_xscale) + i_expert * kargs.hidden_size,
|
||||
static_cast<const SmoothScaleDataType*>(kargs.p_smscale) +
|
||||
i_expert * kargs.hidden_size,
|
||||
make_tuple(kargs.hidden_size),
|
||||
make_tuple(1),
|
||||
number<Vector_N>{},
|
||||
@@ -198,7 +199,7 @@ struct MoeSmoothquant
|
||||
|
||||
__shared__ char smem[GetSmemSize()];
|
||||
|
||||
Pipeline{}(x_window, xscale_window, yscale_window, qy_window, kargs.hidden_size, smem);
|
||||
Pipeline{}(x_window, smscale_window, yscale_window, qy_window, kargs.hidden_size, smem);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
@@ -11,11 +11,11 @@ namespace ck_tile {
|
||||
// host side args
|
||||
struct SmoothquantHostArgs
|
||||
{
|
||||
const void* p_x; // [m ,n], input, fp16/bf16
|
||||
const void* p_xscale; // [1, n], input, columnwise scale, fp32
|
||||
const void* p_x; // [m ,n], input, fp16/bf16
|
||||
const void* p_smscale; // [1, n], input, columnwise scale, fp32
|
||||
|
||||
void* p_yscale; // [m, 1], output, rowwise quant scale (amax / 127) of (p_x * p_xscale)
|
||||
void* p_qy; // [m, n], output, p_x * p_xscale / p_yscale
|
||||
void* p_yscale; // [m, 1], output, rowwise quant scale (amax / 127) of (p_x * p_smscale)
|
||||
void* p_qy; // [m, n], output, p_x * p_smscale / p_yscale
|
||||
|
||||
index_t m;
|
||||
index_t n;
|
||||
@@ -30,11 +30,11 @@ struct Smoothquant
|
||||
using Pipeline = remove_cvref_t<Pipeline_>;
|
||||
using Problem = typename Pipeline::Problem;
|
||||
|
||||
using XDataType = remove_cvref_t<typename Problem::XDataType>;
|
||||
using XScaleDataType = remove_cvref_t<typename Problem::XScaleDataType>;
|
||||
using ComputeDataType = remove_cvref_t<typename Problem::ComputeDataType>;
|
||||
using YScaleDataType = remove_cvref_t<typename Problem::YScaleDataType>;
|
||||
using QYDataType = remove_cvref_t<typename Problem::QYDataType>;
|
||||
using XDataType = remove_cvref_t<typename Problem::XDataType>;
|
||||
using SmoothScaleDataType = remove_cvref_t<typename Problem::SmoothScaleDataType>;
|
||||
using ComputeDataType = remove_cvref_t<typename Problem::ComputeDataType>;
|
||||
using YScaleDataType = remove_cvref_t<typename Problem::YScaleDataType>;
|
||||
using QYDataType = remove_cvref_t<typename Problem::QYDataType>;
|
||||
|
||||
static constexpr index_t Block_M = Problem::BlockShape::Block_M;
|
||||
static constexpr index_t Block_N = Problem::BlockShape::Block_N;
|
||||
@@ -52,7 +52,7 @@ struct Smoothquant
|
||||
struct Kargs
|
||||
{
|
||||
const void* p_x;
|
||||
const void* p_xscale;
|
||||
const void* p_smscale;
|
||||
|
||||
void* p_yscale;
|
||||
void* p_qy;
|
||||
@@ -67,7 +67,7 @@ struct Smoothquant
|
||||
CK_TILE_HOST static constexpr Kargs MakeKargs(const Hargs& hargs)
|
||||
{
|
||||
return Kargs{hargs.p_x,
|
||||
hargs.p_xscale,
|
||||
hargs.p_smscale,
|
||||
hargs.p_yscale,
|
||||
hargs.p_qy,
|
||||
hargs.m,
|
||||
@@ -134,9 +134,9 @@ struct Smoothquant
|
||||
tmp2_, make_tuple(number<Block_M>{}, number<Block_N>{}), {iM, 0});
|
||||
}();
|
||||
|
||||
const auto xscale_window = [&]() {
|
||||
const auto smscale_window = [&]() {
|
||||
const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
|
||||
static_cast<const XScaleDataType*>(kargs.p_xscale),
|
||||
static_cast<const SmoothScaleDataType*>(kargs.p_smscale),
|
||||
make_tuple(kargs.n),
|
||||
make_tuple(1),
|
||||
number<Vector_N>{},
|
||||
@@ -177,7 +177,7 @@ struct Smoothquant
|
||||
|
||||
__shared__ char smem[GetSmemSize()];
|
||||
|
||||
Pipeline{}(x_window, xscale_window, yscale_window, qy_window, kargs.n, smem);
|
||||
Pipeline{}(x_window, smscale_window, yscale_window, qy_window, kargs.n, smem);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
@@ -28,7 +28,7 @@ struct SmoothquantPipelineDefaultPolicy
|
||||
}
|
||||
|
||||
template <typename Problem>
|
||||
CK_TILE_DEVICE static constexpr auto MakeXScaleBlockTileDistribution()
|
||||
CK_TILE_DEVICE static constexpr auto MakeSmoothScaleBlockTileDistribution()
|
||||
{
|
||||
using S = typename Problem::BlockShape;
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
@@ -16,11 +16,11 @@ struct SmoothquantPipelineOnePass
|
||||
using Problem = ck_tile::remove_cvref_t<Problem_>;
|
||||
using Policy = ck_tile::remove_cvref_t<Policy_>;
|
||||
|
||||
using XDataType = ck_tile::remove_cvref_t<typename Problem::XDataType>;
|
||||
using XScaleDataType = ck_tile::remove_cvref_t<typename Problem::XScaleDataType>;
|
||||
using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
|
||||
using QYDataType = ck_tile::remove_cvref_t<typename Problem::QYDataType>;
|
||||
using YScaleDataType = ck_tile::remove_cvref_t<typename Problem::YScaleDataType>;
|
||||
using XDataType = ck_tile::remove_cvref_t<typename Problem::XDataType>;
|
||||
using SmoothScaleDataType = ck_tile::remove_cvref_t<typename Problem::SmoothScaleDataType>;
|
||||
using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
|
||||
using QYDataType = ck_tile::remove_cvref_t<typename Problem::QYDataType>;
|
||||
using YScaleDataType = ck_tile::remove_cvref_t<typename Problem::YScaleDataType>;
|
||||
|
||||
static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
|
||||
static constexpr bool kPadM = false; // TODO - BlockSmoothquantProblem::kPadM
|
||||
@@ -39,9 +39,12 @@ struct SmoothquantPipelineOnePass
|
||||
return Policy::template GetSmemSize<Problem>();
|
||||
}
|
||||
|
||||
template <typename XWindow, typename XScaleWindow, typename QYWindow, typename YScaleWindow>
|
||||
template <typename XWindow,
|
||||
typename SmoothScaleWindow,
|
||||
typename QYWindow,
|
||||
typename YScaleWindow>
|
||||
CK_TILE_DEVICE auto operator()(const XWindow& x_window_,
|
||||
const XScaleWindow& xscale_window_,
|
||||
const SmoothScaleWindow& smscale_window_,
|
||||
YScaleWindow& yscale_window,
|
||||
QYWindow& qy_window,
|
||||
ck_tile::index_t,
|
||||
@@ -49,8 +52,8 @@ struct SmoothquantPipelineOnePass
|
||||
{
|
||||
auto x_window =
|
||||
make_tile_window(x_window_, Policy::template MakeXBlockTileDistribution<Problem>());
|
||||
auto xscale_window = make_tile_window(
|
||||
xscale_window_, Policy::template MakeXScaleBlockTileDistribution<Problem>());
|
||||
auto smscale_window = make_tile_window(
|
||||
smscale_window_, Policy::template MakeSmoothScaleBlockTileDistribution<Problem>());
|
||||
|
||||
auto reduce_absmax_func = ReduceOp::AbsMax{};
|
||||
auto reduce_absmax3_func = [](auto acc_, auto v_0_, auto v_1_) {
|
||||
@@ -67,14 +70,14 @@ struct SmoothquantPipelineOnePass
|
||||
auto block_reduce2d_cross_warp_sync =
|
||||
Policy::template GetBlockReduce2dCrossWarpSync<Problem>();
|
||||
|
||||
const auto x = load_tile(x_window);
|
||||
const auto xscale = load_tile(xscale_window);
|
||||
auto y = tile_elementwise_in(
|
||||
const auto x = load_tile(x_window);
|
||||
const auto smscale = load_tile(smscale_window);
|
||||
auto y = tile_elementwise_in(
|
||||
[&](const auto& a, const auto& b) {
|
||||
return type_convert<ComputeDataType>(a) * type_convert<ComputeDataType>(b);
|
||||
},
|
||||
x,
|
||||
xscale);
|
||||
smscale);
|
||||
|
||||
// compute absmax, cross-lane->cross-warp
|
||||
auto absmax = [&]() {
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
@@ -7,9 +7,9 @@
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
// Y = X * XScale, QY = RowwiseDynamicQuant(Y) = SaturateCast(Y / YScale)
|
||||
// Y = X * SmoothScale, QY = RowwiseDynamicQuant(Y) = SaturateCast(Y / YScale)
|
||||
template <typename XDataType_,
|
||||
typename XScaleDataType_,
|
||||
typename SmoothScaleDataType_,
|
||||
typename ComputeDataType_,
|
||||
typename YScaleDataType_,
|
||||
typename QYDataType_,
|
||||
@@ -18,12 +18,12 @@ template <typename XDataType_,
|
||||
bool kTwoPass_>
|
||||
struct SmoothquantPipelineProblem
|
||||
{
|
||||
using XDataType = remove_cvref_t<XDataType_>;
|
||||
using XScaleDataType = remove_cvref_t<XScaleDataType_>;
|
||||
using ComputeDataType = remove_cvref_t<ComputeDataType_>;
|
||||
using YScaleDataType = remove_cvref_t<YScaleDataType_>;
|
||||
using QYDataType = remove_cvref_t<QYDataType_>;
|
||||
using BlockShape = remove_cvref_t<BlockShape_>;
|
||||
using XDataType = remove_cvref_t<XDataType_>;
|
||||
using SmoothScaleDataType = remove_cvref_t<SmoothScaleDataType_>;
|
||||
using ComputeDataType = remove_cvref_t<ComputeDataType_>;
|
||||
using YScaleDataType = remove_cvref_t<YScaleDataType_>;
|
||||
using QYDataType = remove_cvref_t<QYDataType_>;
|
||||
using BlockShape = remove_cvref_t<BlockShape_>;
|
||||
|
||||
static constexpr bool kNeedCrossLaneSync = BlockShape::ThreadPerWarp_N > 1;
|
||||
static constexpr bool kNeedCrossWarpSync = BlockShape::WarpPerBlock_N > 1;
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
@@ -16,11 +16,11 @@ struct SmoothquantPipelineTwoPass
|
||||
using Problem = ck_tile::remove_cvref_t<Problem_>;
|
||||
using Policy = ck_tile::remove_cvref_t<Policy_>;
|
||||
|
||||
using XDataType = ck_tile::remove_cvref_t<typename Problem::XDataType>;
|
||||
using XScaleDataType = ck_tile::remove_cvref_t<typename Problem::XScaleDataType>;
|
||||
using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
|
||||
using QYDataType = ck_tile::remove_cvref_t<typename Problem::QYDataType>;
|
||||
using YScaleDataType = ck_tile::remove_cvref_t<typename Problem::YScaleDataType>;
|
||||
using XDataType = ck_tile::remove_cvref_t<typename Problem::XDataType>;
|
||||
using SmoothScaleDataType = ck_tile::remove_cvref_t<typename Problem::SmoothScaleDataType>;
|
||||
using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
|
||||
using QYDataType = ck_tile::remove_cvref_t<typename Problem::QYDataType>;
|
||||
using YScaleDataType = ck_tile::remove_cvref_t<typename Problem::YScaleDataType>;
|
||||
|
||||
static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
|
||||
static constexpr bool kPadM = false; // TODO - BlockSmoothquantProblem::kPadM
|
||||
@@ -39,9 +39,12 @@ struct SmoothquantPipelineTwoPass
|
||||
return Policy::template GetSmemSize<Problem>();
|
||||
}
|
||||
|
||||
template <typename XWindow, typename XScaleWindow, typename QYWindow, typename YScaleWindow>
|
||||
template <typename XWindow,
|
||||
typename SmoothScaleWindow,
|
||||
typename QYWindow,
|
||||
typename YScaleWindow>
|
||||
CK_TILE_DEVICE auto operator()(const XWindow& x_window_,
|
||||
const XScaleWindow& xscale_window_,
|
||||
const SmoothScaleWindow& smscale_window_,
|
||||
YScaleWindow& yscale_window,
|
||||
QYWindow& qy_window,
|
||||
ck_tile::index_t row_size,
|
||||
@@ -49,8 +52,8 @@ struct SmoothquantPipelineTwoPass
|
||||
{
|
||||
auto x_window =
|
||||
make_tile_window(x_window_, Policy::template MakeXBlockTileDistribution<Problem>());
|
||||
auto xscale_window = make_tile_window(
|
||||
xscale_window_, Policy::template MakeXScaleBlockTileDistribution<Problem>());
|
||||
auto smscale_window = make_tile_window(
|
||||
smscale_window_, Policy::template MakeSmoothScaleBlockTileDistribution<Problem>());
|
||||
|
||||
static constexpr index_t Block_N = Problem::BlockShape::Block_N;
|
||||
index_t num_n_tile_iteration =
|
||||
@@ -76,14 +79,14 @@ struct SmoothquantPipelineTwoPass
|
||||
|
||||
for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
|
||||
{
|
||||
const auto x = load_tile(x_window);
|
||||
const auto xscale = load_tile(xscale_window);
|
||||
const auto y = tile_elementwise_in(
|
||||
const auto x = load_tile(x_window);
|
||||
const auto smscale = load_tile(smscale_window);
|
||||
const auto y = tile_elementwise_in(
|
||||
[&](const auto& a, const auto& b) {
|
||||
return type_convert<ComputeDataType>(a) * type_convert<ComputeDataType>(b);
|
||||
},
|
||||
x,
|
||||
xscale);
|
||||
smscale);
|
||||
|
||||
constexpr auto x_size_per_row =
|
||||
x.get_tile_distribution().get_ys_to_d_descriptor().get_lengths().at(number<1>{});
|
||||
@@ -94,7 +97,7 @@ struct SmoothquantPipelineTwoPass
|
||||
block_reduce2d(y, absmax, reduce_absmax_func);
|
||||
|
||||
move_tile_window(x_window, {0, Block_N});
|
||||
move_tile_window(xscale_window, {Block_N});
|
||||
move_tile_window(smscale_window, {Block_N});
|
||||
}
|
||||
|
||||
// compute absmax, cross-lane->cross-warp
|
||||
@@ -114,20 +117,20 @@ struct SmoothquantPipelineTwoPass
|
||||
row_size % Block_N == 0 ? row_size - Block_N : row_size - row_size % Block_N;
|
||||
|
||||
move_tile_window(x_window, {0, -Block_N});
|
||||
move_tile_window(xscale_window, {-Block_N});
|
||||
move_tile_window(smscale_window, {-Block_N});
|
||||
move_tile_window(qy_window, {0, stride_to_right_most_window});
|
||||
|
||||
// recompute y and quantize y to qy
|
||||
for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
|
||||
{
|
||||
const auto x = load_tile(x_window);
|
||||
const auto xscale = load_tile(xscale_window);
|
||||
const auto y = tile_elementwise_in(
|
||||
const auto x = load_tile(x_window);
|
||||
const auto smscale = load_tile(smscale_window);
|
||||
const auto y = tile_elementwise_in(
|
||||
[&](const auto& a, const auto& b) {
|
||||
return type_convert<ComputeDataType>(a) * type_convert<ComputeDataType>(b);
|
||||
},
|
||||
x,
|
||||
xscale);
|
||||
smscale);
|
||||
|
||||
auto qy = make_static_distributed_tensor<QYDataType>(y.get_tile_distribution());
|
||||
sweep_tile(qy, [&](auto idx) {
|
||||
@@ -138,7 +141,7 @@ struct SmoothquantPipelineTwoPass
|
||||
store_tile(qy_window, qy);
|
||||
|
||||
move_tile_window(x_window, {0, -Block_N});
|
||||
move_tile_window(xscale_window, {0, -Block_N});
|
||||
move_tile_window(smscale_window, {0, -Block_N});
|
||||
move_tile_window(qy_window, {0, -Block_N});
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user