mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-04 05:31:24 +00:00
[Ck tile] layernorm2d fwd optimize (#1637)
* optimize small N case using vectorized I/O and reciprocal-based division * [Ck_tile] layernorm: add param to control fastdiv; update generated code, tests pass * [Ck_tile] fix blockSize computation in Generic2dBlockShape * [Ck_tile] fix kFastFDiv template style * [Ck_tile] layernorm: fix style issues from review --------- Co-authored-by: dummycoderfe <noplydummmycoder@163.com>
This commit is contained in:
@@ -7,25 +7,46 @@
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
template <typename T>
|
||||
CK_TILE_DEVICE void welford_update(T& mean, T& var, T x, int count)
|
||||
template <typename T, bool kFastFDiv = false>
|
||||
CK_TILE_DEVICE void welford_update(T& mean, T& var, T x, int count, bool_constant<kFastFDiv> = {})
|
||||
{
|
||||
// TODO: check nan? maybe no
|
||||
T delta = x - mean;
|
||||
mean += delta / count;
|
||||
if(kFastFDiv && std::is_same_v<T, float>)
|
||||
{
|
||||
mean += delta * __builtin_amdgcn_rcpf(count);
|
||||
}
|
||||
else
|
||||
{
|
||||
mean += delta / count;
|
||||
}
|
||||
T delta2 = x - mean;
|
||||
var += delta * delta2;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
CK_TILE_DEVICE static void
|
||||
welford_merge(T& mean_a, T& var_a, int& count_a, T mean_b, T var_b, int count_b)
|
||||
template <typename T, bool kFastFDiv = false>
|
||||
CK_TILE_DEVICE static void welford_merge(T& mean_a,
|
||||
T& var_a,
|
||||
int& count_a,
|
||||
T mean_b,
|
||||
T var_b,
|
||||
int count_b,
|
||||
bool_constant<kFastFDiv> = {})
|
||||
{
|
||||
int count = count_a + count_b;
|
||||
T count_ = type_convert<T>(count);
|
||||
T count_a_ = type_convert<T>(count_a);
|
||||
T count_b_ = type_convert<T>(count_b);
|
||||
T count_b_over_count = count == 0 ? type_convert<T>(0) : count_b_ / count_;
|
||||
int count = count_a + count_b;
|
||||
T count_ = type_convert<T>(count);
|
||||
T count_a_ = type_convert<T>(count_a);
|
||||
T count_b_ = type_convert<T>(count_b);
|
||||
T count_b_over_count;
|
||||
if(kFastFDiv && std::is_same_v<T, float>)
|
||||
{
|
||||
count_b_over_count =
|
||||
count == 0 ? type_convert<T>(0) : count_b_ * __builtin_amdgcn_rcpf(count_);
|
||||
}
|
||||
else
|
||||
{
|
||||
count_b_over_count = count == 0 ? type_convert<T>(0) : count_b_ / count_;
|
||||
}
|
||||
|
||||
T delta = mean_b - mean_a;
|
||||
mean_a += delta * count_b_over_count;
|
||||
|
||||
Reference in New Issue
Block a user