mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-04 13:41:24 +00:00
[Ck_tile] hot fix, fix rpcf param setting err (#1657)
Co-authored-by: dummycoderfe <noplydummmycoder@163.com>
This commit is contained in:
@@ -121,7 +121,7 @@ struct Layernorm2dFwdPipelineOnePass
|
||||
auto [mean, var] = block_welford(acc, cur_count, max_count);
|
||||
block_welford_sync(mean, var, cur_count);
|
||||
block_welford_cross_warp_sync(mean, var, cur_count, smem);
|
||||
block_tile_welford_post_scale_var(var, cur_count);
|
||||
block_tile_welford_post_scale_var(var, cur_count, constant<kFastFDiv>{});
|
||||
|
||||
// compute inv-std
|
||||
auto inv_std = tile_elementwise_in(
|
||||
|
||||
@@ -35,6 +35,7 @@ struct Layernorm2dFwdPipelineTwoPass
|
||||
static constexpr bool kNeedCrossWarpSync = Problem::kNeedCrossWarpSync;
|
||||
static constexpr bool kPadM = false; // TODO - BlockLayernorm2dFwdProblem::kPadM
|
||||
static constexpr bool kPadN = Problem::Traits::kPadN;
|
||||
static constexpr bool kFastFDiv = Problem::Traits::kFastFDiv;
|
||||
static constexpr auto kFusedAdd = Problem::Traits::kFusedAdd;
|
||||
static constexpr auto kFusedQuant = Problem::Traits::kFusedQuant;
|
||||
|
||||
@@ -137,15 +138,22 @@ struct Layernorm2dFwdPipelineTwoPass
|
||||
|
||||
block_welford_sync(mean, var, cur_count);
|
||||
block_welford_cross_warp_sync(mean, var, cur_count, smem);
|
||||
block_tile_welford_post_scale_var(var, cur_count);
|
||||
block_tile_welford_post_scale_var(var, cur_count, constant<kFastFDiv>{});
|
||||
|
||||
// compute inv-std
|
||||
auto inv_std = tile_elementwise_in(
|
||||
[&](const auto& v_) {
|
||||
return type_convert<ComputeDataType>(1.0f) / (sqrt(v_ + epsilon));
|
||||
if(kFastFDiv && std::is_same_v<ComputeDataType, float>)
|
||||
{
|
||||
return type_convert<ComputeDataType>(1.0f) *
|
||||
__builtin_amdgcn_rcpf(sqrt(v_ + epsilon));
|
||||
}
|
||||
else
|
||||
{
|
||||
return type_convert<ComputeDataType>(1.0f) / sqrt(v_ + epsilon);
|
||||
}
|
||||
},
|
||||
var);
|
||||
|
||||
if constexpr(kSaveMean)
|
||||
store_tile(mean_window, cast_tile<MeanDataType>(mean));
|
||||
if constexpr(kSaveInvStd)
|
||||
|
||||
@@ -47,8 +47,11 @@ struct BlockWelford
|
||||
|
||||
auto x = ck_tile::type_convert<ComputeDataType>(x_tensor[in_dstr_idx]);
|
||||
|
||||
welford_update(
|
||||
mean_tensor(out_dstr_idx), var_tensor(out_dstr_idx), x, cur_count_);
|
||||
welford_update(mean_tensor(out_dstr_idx),
|
||||
var_tensor(out_dstr_idx),
|
||||
x,
|
||||
cur_count_,
|
||||
constant<kFastFDiv>{});
|
||||
});
|
||||
}
|
||||
});
|
||||
@@ -159,7 +162,8 @@ struct BlockWelfordSync
|
||||
v_local_count,
|
||||
v_remote_mean,
|
||||
v_remote_var,
|
||||
v_remote_count);
|
||||
v_remote_count,
|
||||
constant<kFastFDiv>{});
|
||||
});
|
||||
}
|
||||
});
|
||||
@@ -307,7 +311,8 @@ struct BlockWelfordCrossWarpSync
|
||||
v_local_count,
|
||||
v_remote_mean,
|
||||
v_remote_var,
|
||||
v_remote_count);
|
||||
v_remote_count,
|
||||
constant<kFastFDiv>{});
|
||||
});
|
||||
|
||||
mean_tensor.get_thread_buffer()(i_0) = v_local_mean;
|
||||
|
||||
Reference in New Issue
Block a user