Revert "Revert " Fp8 block scale quantization for fmha fwd (#3330)" (#3633)" (#3635)

This reverts commit de5a1d730d.

Co-authored-by: Illia Silin <98187287+illsilin@users.noreply.github.com>
This commit is contained in:
ltqin
2026-01-24 01:03:22 +08:00
committed by GitHub
parent 2e08a7e5ab
commit 67f0b74ec6
14 changed files with 667 additions and 84 deletions

View File

@@ -57,8 +57,13 @@ struct BlockFmhaPipelineQRKSVS
static constexpr auto BiasEnum = Problem::BiasEnum;
static constexpr bool kStoreLSE = Problem::kStoreLSE;
static constexpr bool kHasDropout = Problem::kHasDropout;
static constexpr auto QScaleEnum = Problem::QScaleEnum;
static constexpr bool kHasSink = Problem::kHasSink;
// For BLOCKSCALE: shift value for exp2(x + shift) to scale P to [0, 2^shift]
static constexpr float OCP_FP8_SHIFT = 8.0f;
static constexpr float FNUZ_FP8_SHIFT = 7.0f;
static constexpr uint32_t DS_READ = 0x100; // Barrier for DS (data share) read
static constexpr uint32_t MFMA = 0x008; // Barrier for MFMA (matrix multiply-accumulate)
@@ -167,6 +172,9 @@ struct BlockFmhaPipelineQRKSVS
const BlockIndices& block_indices,
void* smem_ptr,
DropoutType& dropout,
const float* k_descale_ptr,
const float* v_descale_ptr,
const index_t block_scale_size_kv,
const float sink_v) const
{
static_assert(
@@ -358,6 +366,13 @@ struct BlockFmhaPipelineQRKSVS
static_assert(1 <= k1_loops);
do
{
float k_descale = 1.0f;
if constexpr(QScaleEnum == BlockAttentionQuantScaleEnum::BLOCKSCALE)
{
// K and V share the same seqlen_k position within a block
const index_t kv_idx = (kv_load_start + i_total_loops * kN0) / block_scale_size_kv;
k_descale = k_descale_ptr[kv_idx];
}
// STAGE 1, QK gemm
auto k_dram_window = make_tile_window(
k_dram_block_window.get_bottom_tensor_view(),
@@ -427,11 +442,20 @@ struct BlockFmhaPipelineQRKSVS
k_lds_window);
schedule_gemm0();
}
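// dequant: for BLOCKSCALE, fold this KV block's k_descale into the S-accumulator
// element function; for every other QScaleEnum the element function is used unchanged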
auto s_acc_element_func_ = [&s_acc_element_func, k_descale]() {
if constexpr(QScaleEnum == BlockAttentionQuantScaleEnum::BLOCKSCALE)
{
return s_acc_element_func * k_descale;
}
else
return s_acc_element_func;
}();
// STAGE 2, scale_s, add bias, mask, softmax
if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
{
s_acc = tile_elementwise_in(s_acc_element_func, s_acc);
s_acc = tile_elementwise_in(s_acc_element_func_, s_acc);
tile_elementwise_inout([&scale_s](auto& x) { x = x * scale_s; }, s_acc);
tile_elementwise_inout(
[&](auto& x, const auto& y) {
@@ -449,7 +473,7 @@ struct BlockFmhaPipelineQRKSVS
{
const auto k_origin = k_dram_block_window.get_window_origin();
constexpr auto s_spans = decltype(s_acc)::get_distributed_spans();
s_acc = tile_elementwise_in(s_acc_element_func, s_acc);
s_acc = tile_elementwise_in(s_acc_element_func_, s_acc);
sweep_tile_span(s_spans[number<0>{}], [&](auto idx0) {
sweep_tile_span(s_spans[number<1>{}], [&](auto idx1) {
const auto tile_idx = get_x_indices_from_distributed_indices(
@@ -466,7 +490,7 @@ struct BlockFmhaPipelineQRKSVS
}
else
{
s_acc = tile_elementwise_in(s_acc_element_func, s_acc);
s_acc = tile_elementwise_in(s_acc_element_func_, s_acc);
if constexpr(kHasLogitsSoftCap)
{
auto apply_logits_transform =
@@ -571,7 +595,21 @@ struct BlockFmhaPipelineQRKSVS
sweep_tile_span(p_spans[number<0>{}], [&](auto idx0) {
constexpr auto i_idx = make_tuple(idx0);
#if CK_TILE_FMHA_FWD_FAST_EXP2
auto row_max = scale_s * get_validated_m(m[i_idx]);
// For BLOCKSCALE: precompute (m - shift) once per row
// Bias/Alibi/SoftCap: exp2(s - m + shift) = exp2(s - (m - shift))
// else: exp2(scale_s*s - scale_s*m + shift) = exp2(scale_s*s - (scale_s*m - shift))
auto validated_m = get_validated_m(m[i_idx]);
auto row_max = scale_s * validated_m;
if constexpr(QScaleEnum == BlockAttentionQuantScaleEnum::BLOCKSCALE)
{
#if CK_TILE_USE_OCP_FP8
validated_m -= OCP_FP8_SHIFT; // for Bias/Alibi/SoftCap
row_max -= OCP_FP8_SHIFT; // for else branch
#else
validated_m -= FNUZ_FP8_SHIFT;
row_max -= FNUZ_FP8_SHIFT;
#endif
}
#endif
sweep_tile_span(p_spans[number<1>{}], [&](auto idx1) {
constexpr auto i_j_idx = make_tuple(idx0, idx1);
@@ -579,13 +617,13 @@ struct BlockFmhaPipelineQRKSVS
if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS ||
BiasEnum == BlockAttentionBiasEnum::ALIBI)
{
p_compute(i_j_idx) = exp2(s[i_j_idx] - get_validated_m(m[i_idx]));
p_compute(i_j_idx) = exp2(s[i_j_idx] - validated_m);
}
else
{
if constexpr(kHasLogitsSoftCap)
{
p_compute(i_j_idx) = exp2(s[i_j_idx] - get_validated_m(m[i_idx]));
p_compute(i_j_idx) = exp2(s[i_j_idx] - validated_m);
}
else
{
@@ -676,18 +714,39 @@ struct BlockFmhaPipelineQRKSVS
store_tile(v_lds_window,
tile_elementwise_in(v_element_func, v_prefetch)); // store the prefetch
}
move_tile_window(v_dram_window, {0, kK1});
const auto p =
cast_tile<PDataType>(tile_elementwise_in(p_compute_element_func, p_compute));
float v_descale = 1.0f;
if constexpr(QScaleEnum == BlockAttentionQuantScaleEnum::BLOCKSCALE)
{
// K and V share the same seqlen_k position within a block
const index_t kv_idx = (kv_load_start + i_total_loops * kN0) / block_scale_size_kv;
v_descale = v_descale_ptr[kv_idx];
}
// STAGE 3, KV gemm
auto o_acc0 = decltype(o_acc){};
clear_tile(o_acc0);
auto& o_acc_ = [&o_acc0, &o_acc]() -> auto& {
if constexpr(QScaleEnum == BlockAttentionQuantScaleEnum::BLOCKSCALE)
{
return o_acc0;
}
else
{
return o_acc;
}
}();
if constexpr(k1_loops > 1)
{
static_for<0, k1_loops - 1, 1>{}([&](auto i_k1) {
const auto v = load_tile(v_dram_window); // load next v
block_sync_lds();
gemm_1(o_acc,
gemm_1(o_acc_,
get_slice_tile(
p, sequence<0, i_k1 * kK1>{}, sequence<kM0, (i_k1 + 1) * kK1>{}),
v_lds_window);
@@ -722,11 +781,16 @@ struct BlockFmhaPipelineQRKSVS
// tail
{
block_sync_lds();
gemm_1(o_acc,
gemm_1(o_acc_,
get_slice_tile(p, sequence<0, (k1_loops - 1) * kK1>{}, sequence<kM0, kN0>{}),
v_lds_window);
block_sync_lds();
}
if constexpr(QScaleEnum == BlockAttentionQuantScaleEnum::BLOCKSCALE)
{
tile_elementwise_inout(
[&v_descale](auto& o, auto& o0) { o += o0 * v_descale; }, o_acc, o_acc0);
}
} while(++i_total_loops < num_total_loop);
// store lse
@@ -846,6 +910,9 @@ struct BlockFmhaPipelineQRKSVS
block_indices,
smem_ptr,
dropout,
nullptr,
nullptr,
1,
sink_v);
}
};

View File

@@ -46,6 +46,7 @@ struct BlockFmhaPipelineQRKSVSAsync
static constexpr index_t kK1 = BlockFmhaShape::kK1;
static constexpr index_t kQKHeaddim = BlockFmhaShape::kQKHeaddim;
static constexpr index_t kSubQKHeaddim = BlockFmhaShape::kSubQKHeaddim;
static constexpr auto QScaleEnum = Problem::QScaleEnum;
static_assert(kSubQKHeaddim <= 256, "hdim bigger than 256 is not suitable for this pipeline!");
@@ -64,6 +65,10 @@ struct BlockFmhaPipelineQRKSVSAsync
static constexpr bool kHasDropout = Problem::kHasDropout;
static constexpr bool kHasSink = Problem::kHasSink;
// For BLOCKSCALE: shift value for exp2(x + shift) to scale P to [0, 2^shift]
static constexpr float OCP_FP8_SHIFT = 8.0f;
static constexpr float FNUZ_FP8_SHIFT = 7.0f;
static_assert((CK_TILE_FMHA_FWD_FAST_EXP2 &&
(kHasLogitsSoftCap && Problem::BiasEnum == BlockAttentionBiasEnum::NO_BIAS ||
!kHasLogitsSoftCap)) ||
@@ -190,6 +195,9 @@ struct BlockFmhaPipelineQRKSVSAsync
const BlockIndices& block_indices,
void* smem_ptr,
DropoutType& dropout,
const float* k_descale_ptr,
const float* v_descale_ptr,
const index_t block_scale_size_kv,
const float sink_v) const
{
static_assert(
@@ -403,6 +411,13 @@ struct BlockFmhaPipelineQRKSVSAsync
// main loop
do
{
float k_descale = 1.0f;
if constexpr(QScaleEnum == BlockAttentionQuantScaleEnum::BLOCKSCALE)
{
// K and V share the same seqlen_k position within a block
const index_t kv_idx = (kv_load_start + i_total_loops * kN0) / block_scale_size_kv;
k_descale = k_descale_ptr[kv_idx];
}
// STAGE 1, QK gemm
clear_tile(s_acc); // initialize C
if constexpr(k0_loops > 1)
@@ -449,11 +464,20 @@ struct BlockFmhaPipelineQRKSVSAsync
sequence<(LdsSeq.at(number<k0_loops - 1>{}) + 1) * kN0, kK0>{}));
}
__builtin_amdgcn_sched_barrier(1);
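// dequant: for BLOCKSCALE, fold this KV block's k_descale into the S-accumulator
// element function; for every other QScaleEnum the element function is used unchanged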
auto s_acc_element_func_ = [&s_acc_element_func, k_descale]() {
if constexpr(QScaleEnum == BlockAttentionQuantScaleEnum::BLOCKSCALE)
{
return s_acc_element_func * k_descale;
}
else
return s_acc_element_func;
}();
// STAGE 2, scale_s, add bias, mask, softmax
if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS)
{
s_acc = tile_elementwise_in(s_acc_element_func, s_acc);
s_acc = tile_elementwise_in(s_acc_element_func_, s_acc);
tile_elementwise_inout([&scale_s](auto& x) { x = x * scale_s; }, s_acc);
tile_elementwise_inout(
[&](auto& x, const auto& y) {
@@ -471,7 +495,7 @@ struct BlockFmhaPipelineQRKSVSAsync
{
const auto k_origin = k_dram_block_window.get_window_origin();
constexpr auto s_spans = decltype(s_acc)::get_distributed_spans();
s_acc = tile_elementwise_in(s_acc_element_func, s_acc);
s_acc = tile_elementwise_in(s_acc_element_func_, s_acc);
sweep_tile_span(s_spans[number<0>{}], [&](auto idx0) {
sweep_tile_span(s_spans[number<1>{}], [&](auto idx1) {
const auto tile_idx = get_x_indices_from_distributed_indices(
@@ -488,7 +512,7 @@ struct BlockFmhaPipelineQRKSVSAsync
}
else
{
s_acc = tile_elementwise_in(s_acc_element_func, s_acc);
s_acc = tile_elementwise_in(s_acc_element_func_, s_acc);
if constexpr(kHasLogitsSoftCap)
{
auto apply_logits_transform =
@@ -630,7 +654,21 @@ struct BlockFmhaPipelineQRKSVSAsync
sweep_tile_span(p_spans[number<0>{}], [&](auto idx0) {
constexpr auto i_idx = make_tuple(idx0);
#if CK_TILE_FMHA_FWD_FAST_EXP2
auto row_max = scale_s * get_validated_m(m[i_idx]);
// For BLOCKSCALE: precompute (m - shift) once per row
// Bias/Alibi/SoftCap: exp2(s - m + shift) = exp2(s - (m - shift))
// else: exp2(scale_s*s - scale_s*m + shift) = exp2(scale_s*s - (scale_s*m - shift))
auto validated_m = get_validated_m(m[i_idx]);
auto row_max = scale_s * validated_m;
if constexpr(QScaleEnum == BlockAttentionQuantScaleEnum::BLOCKSCALE)
{
#if CK_TILE_USE_OCP_FP8
validated_m -= OCP_FP8_SHIFT; // for Bias/Alibi/SoftCap
row_max -= OCP_FP8_SHIFT; // for else branch
#else
validated_m -= FNUZ_FP8_SHIFT;
row_max -= FNUZ_FP8_SHIFT;
#endif
}
#endif
sweep_tile_span(p_spans[number<1>{}], [&](auto idx1) {
constexpr auto i_j_idx = make_tuple(idx0, idx1);
@@ -638,13 +676,13 @@ struct BlockFmhaPipelineQRKSVSAsync
if constexpr(BiasEnum == BlockAttentionBiasEnum::ELEMENTWISE_BIAS ||
BiasEnum == BlockAttentionBiasEnum::ALIBI)
{
p_compute(i_j_idx) = exp2(s[i_j_idx] - get_validated_m(m[i_idx]));
p_compute(i_j_idx) = exp2(s[i_j_idx] - validated_m);
}
else
{
if constexpr(kHasLogitsSoftCap)
{
p_compute(i_j_idx) = exp2(s[i_j_idx] - get_validated_m(m[i_idx]));
p_compute(i_j_idx) = exp2(s[i_j_idx] - validated_m);
}
else
{
@@ -735,7 +773,27 @@ struct BlockFmhaPipelineQRKSVSAsync
#endif
}();
float v_descale = 1.0f;
if constexpr(QScaleEnum == BlockAttentionQuantScaleEnum::BLOCKSCALE)
{
// K and V share the same seqlen_k position within a block
const index_t kv_idx = (kv_load_start + i_total_loops * kN0) / block_scale_size_kv;
v_descale = v_descale_ptr[kv_idx];
}
// STAGE 3, KV gemm
auto o_acc0 = decltype(o_acc){};
clear_tile(o_acc0);
auto& o_acc_ = [&o_acc0, &o_acc]() -> auto& {
if constexpr(QScaleEnum == BlockAttentionQuantScaleEnum::BLOCKSCALE)
{
return o_acc0;
}
else
{
return o_acc;
}
}();
if constexpr(k1_loops > 1)
{
static_for<0, k1_loops - 1, 1>{}([&](auto i_k1) {
@@ -745,7 +803,7 @@ struct BlockFmhaPipelineQRKSVSAsync
v_dram_window, number<-1>{}, bool_constant<false>{}); // load next v_buf
}
block_sync_lds();
gemm_1(o_acc,
gemm_1(o_acc_,
get_slice_tile(
p, sequence<0, i_k1 * kK1>{}, sequence<kM0, (i_k1 + 1) * kK1>{}),
get_slice_tile(
@@ -808,13 +866,19 @@ struct BlockFmhaPipelineQRKSVSAsync
{
block_sync_lds();
gemm_1(
o_acc,
o_acc_,
get_slice_tile(p, sequence<0, (k1_loops - 1) * kK1>{}, sequence<kM0, kN0>{}),
get_slice_tile(
v_lds_window,
sequence<(LdsSeq.at(number<k0_loops + k1_loops - 1>{})) * kN1, 0>{},
sequence<(LdsSeq.at(number<k0_loops + k1_loops - 1>{}) + 1) * kN1, kK1>{}));
}
if constexpr(QScaleEnum == BlockAttentionQuantScaleEnum::BLOCKSCALE)
{
tile_elementwise_inout(
[&v_descale](auto& o, auto& o0) { o += o0 * v_descale; }, o_acc, o_acc0);
}
} while(i_total_loops < num_total_loop);
// store lse
@@ -922,6 +986,9 @@ struct BlockFmhaPipelineQRKSVSAsync
block_indices,
smem_ptr,
dropout,
nullptr,
nullptr,
1,
sink_v);
}
};