mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-05 14:11:29 +00:00
Layernorm and groupnorm support to save mean and inverse std in forward (#929)
* save mean and inverse std in normalization
* Save mean and inverse std in splitK
* Vector save mean and inv std
* Modify instance for save mean and std
* simplify the layernorm example
* Save mean and std in groupnorm example
* Save mean and inv std in ckProfiler and test
* Remove compute data type from base class
* Save mean and inv std in client example
* Add changelog
* clang format
* Fix compile error
* Refine naming
* Avoid error in bf16
* revert changelog
This commit is contained in:
@@ -18,9 +18,11 @@ template <typename XDataType,
|
||||
typename GammaDataType,
|
||||
typename BetaDataType,
|
||||
typename YDataType,
|
||||
typename SaveMeanInvStdDataType,
|
||||
typename ComputeDataType,
|
||||
typename YElementwiseOperation,
|
||||
typename GridDesc_M_K,
|
||||
typename GridDesc_M,
|
||||
index_t BlockSize,
|
||||
index_t MThreadClusterSize,
|
||||
index_t KThreadClusterSize,
|
||||
@@ -34,6 +36,7 @@ template <typename XDataType,
|
||||
index_t BetaSrcVectorSize,
|
||||
index_t YDstVectorDim,
|
||||
index_t YDstVectorSize,
|
||||
index_t SaveMeanInvStdDstVectorSize,
|
||||
bool SweepOnce>
|
||||
struct GridwiseNormalizationNaiveVariance_mk_to_mk
|
||||
{
|
||||
@@ -45,6 +48,10 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
|
||||
(YDstVectorDim == 1 && KThreadSliceSize % YDstVectorSize == 0),
|
||||
"Invalid thread slice sizes and/or vector sizes configuration, please check!");
|
||||
|
||||
static_assert(MThreadSliceSize % SaveMeanInvStdDstVectorSize == 0,
|
||||
"Invalid thread slice sizes and/or save mean and inverse std vector sizes "
|
||||
"configuration, please check!");
|
||||
|
||||
static_assert(XSrcVectorSize == YDstVectorSize);
|
||||
static_assert(XSrcVectorSize == GammaSrcVectorSize);
|
||||
static_assert(XSrcVectorSize == BetaSrcVectorSize);
|
||||
@@ -66,6 +73,10 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
|
||||
static constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed(
|
||||
make_tuple(Number<MThreadSliceSize>{}, Number<XSrcVectorSize>{}));
|
||||
|
||||
using ThreadBufferLengths_M = Sequence<MThreadSliceSize>;
|
||||
static constexpr auto thread_buffer_desc_m =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));
|
||||
|
||||
using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed(
|
||||
make_tuple(Number<MThreadSliceSize>{}, Number<XSrcVectorSize>{})));
|
||||
using ThreadReduceDstDesc_M =
|
||||
@@ -84,6 +95,8 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
|
||||
reduce::Add,
|
||||
true>;
|
||||
|
||||
using PassThroughOp = tensor_operation::element_wise::PassThrough;
|
||||
|
||||
static constexpr auto I0 = Number<0>{};
|
||||
static constexpr auto I1 = Number<1>{};
|
||||
static constexpr auto I2 = Number<2>{};
|
||||
@@ -98,12 +111,16 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
|
||||
const GridDesc_M_K& gamma_grid_desc_m_k,
|
||||
const GridDesc_M_K& beta_grid_desc_m_k,
|
||||
const GridDesc_M_K& y_grid_desc_m_k,
|
||||
const GridDesc_M& save_mean_grid_desc_m,
|
||||
const GridDesc_M& save_inv_std_grid_desc_m,
|
||||
index_t num_k_block_tile_iteration,
|
||||
ComputeDataType epsilon,
|
||||
const XDataType* const __restrict__ p_x_global,
|
||||
const GammaDataType* const __restrict__ p_gamma_global,
|
||||
const BetaDataType* const __restrict__ p_beta_global,
|
||||
YDataType* const __restrict__ p_y_global,
|
||||
SaveMeanInvStdDataType* const __restrict__ p_save_mean_global,
|
||||
SaveMeanInvStdDataType* const __restrict__ p_save_inv_std_global,
|
||||
const YElementwiseOperation y_elementwise_op)
|
||||
{
|
||||
// LDS
|
||||
@@ -115,6 +132,12 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
|
||||
auto y_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_y_global, y_grid_desc_m_k.GetElementSpaceSize());
|
||||
|
||||
auto save_mean_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_save_mean_global, save_mean_grid_desc_m.GetElementSpaceSize());
|
||||
|
||||
auto save_inv_std_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_save_inv_std_global, save_inv_std_grid_desc_m.GetElementSpaceSize());
|
||||
|
||||
auto x_thread_buf = generate_tuple(
|
||||
[&](auto) {
|
||||
return StaticBuffer<AddressSpaceEnum::Vgpr,
|
||||
@@ -152,6 +175,8 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
|
||||
mean_square_thread_buf;
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MThreadSliceSize, true>&
|
||||
var_thread_buf = mean_square_thread_buf;
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MThreadSliceSize, true>&
|
||||
inv_std_thread_buf = mean_square_thread_buf;
|
||||
|
||||
const index_t thread_local_id = get_thread_local_1d_id();
|
||||
const index_t block_global_id = get_block_1d_id();
|
||||
@@ -228,6 +253,42 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
|
||||
thread_k_cluster_id * YDstVectorSize),
|
||||
y_elementwise_op);
|
||||
|
||||
auto threadwise_mean_store =
|
||||
ThreadwiseTensorSliceTransfer_v1r3<ComputeDataType,
|
||||
SaveMeanInvStdDataType,
|
||||
decltype(thread_buffer_desc_m),
|
||||
GridDesc_M,
|
||||
PassThroughOp,
|
||||
ThreadBufferLengths_M,
|
||||
Sequence<0>, // DimAccessOrder
|
||||
0, // SrcVectorDim
|
||||
SaveMeanInvStdDstVectorSize, // ScalarPerVector
|
||||
InMemoryDataOperationEnum::Set,
|
||||
1,
|
||||
true>(
|
||||
save_mean_grid_desc_m,
|
||||
make_multi_index(block_global_id * M_BlockTileSize +
|
||||
thread_m_cluster_id * MThreadSliceSize),
|
||||
PassThroughOp{});
|
||||
|
||||
auto threadwise_inv_std_store =
|
||||
ThreadwiseTensorSliceTransfer_v1r3<ComputeDataType,
|
||||
SaveMeanInvStdDataType,
|
||||
decltype(thread_buffer_desc_m),
|
||||
GridDesc_M,
|
||||
PassThroughOp,
|
||||
ThreadBufferLengths_M,
|
||||
Sequence<0>, // DimAccessOrder
|
||||
0, // SrcVectorDim
|
||||
SaveMeanInvStdDstVectorSize, // ScalarPerVector
|
||||
InMemoryDataOperationEnum::Set,
|
||||
1,
|
||||
true>(
|
||||
save_inv_std_grid_desc_m,
|
||||
make_multi_index(block_global_id * M_BlockTileSize +
|
||||
thread_m_cluster_id * MThreadSliceSize),
|
||||
PassThroughOp{});
|
||||
|
||||
constexpr auto thread_copy_fwd_step_m_k = make_multi_index(0, K_BlockTileStepSize);
|
||||
constexpr auto thread_copy_bwd_step_m_k =
|
||||
make_multi_index(0, SweepOnce ? 0 : -K_BlockTileSize);
|
||||
@@ -243,7 +304,8 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
|
||||
|
||||
// E(x), E[x^2], var(x)
|
||||
// FIXME: Should not hack the transform from deviceOP
|
||||
int reduce_length = x_grid_desc_m_k.GetTransforms()[I2].GetUpperLengths()[I0];
|
||||
ComputeDataType reduce_length = type_convert<ComputeDataType>(
|
||||
x_grid_desc_m_k.GetTransforms()[I2].GetUpperLengths()[I0]);
|
||||
|
||||
static_for<0, MThreadSliceSize, 1>{}([&](auto I) {
|
||||
mean_thread_buf(I) = reduce::Add::template GetIdentityValue<ComputeDataType>();
|
||||
@@ -302,10 +364,34 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
|
||||
// var(x) = E[x^2] - E[x]^2
|
||||
var_thread_buf(I) =
|
||||
mean_square_thread_buf(I) - (mean_thread_buf(I) * mean_thread_buf(I));
|
||||
|
||||
inv_std_thread_buf(I) = type_convert<ComputeDataType>(1.0f) /
|
||||
ck::math::sqrt(var_thread_buf(I) + epsilon);
|
||||
});
|
||||
|
||||
// save mean and inverse std for backward (optional)
|
||||
if(thread_k_cluster_id == 0)
|
||||
{
|
||||
if(p_save_mean_global != nullptr)
|
||||
{
|
||||
threadwise_mean_store.Run(thread_buffer_desc_m,
|
||||
make_tuple(I0),
|
||||
mean_thread_buf,
|
||||
save_mean_grid_desc_m,
|
||||
save_mean_global_val_buf);
|
||||
}
|
||||
if(p_save_inv_std_global != nullptr)
|
||||
{
|
||||
threadwise_inv_std_store.Run(thread_buffer_desc_m,
|
||||
make_tuple(I0),
|
||||
inv_std_thread_buf,
|
||||
save_inv_std_grid_desc_m,
|
||||
save_inv_std_global_val_buf);
|
||||
}
|
||||
}
|
||||
|
||||
// normalization
|
||||
static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
|
||||
auto divisor = 1 / ck::math::sqrt(var_thread_buf(iM) + epsilon);
|
||||
static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) {
|
||||
static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) {
|
||||
constexpr auto offset_m_k =
|
||||
@@ -314,7 +400,7 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
|
||||
// normalize
|
||||
y_thread_buf(iK0)(Number<offset_m_k>{}) =
|
||||
(x_thread_buf(iK0)(Number<offset_m_k>{}) - mean_thread_buf(iM)) *
|
||||
divisor;
|
||||
inv_std_thread_buf(iM);
|
||||
|
||||
// gamma & beta
|
||||
y_thread_buf(iK0)(Number<offset_m_k>{}) =
|
||||
@@ -404,8 +490,30 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
|
||||
// var(x) = E[x^2] - E[x]^2
|
||||
var_thread_buf(I) =
|
||||
mean_square_thread_buf(I) - (mean_thread_buf(I) * mean_thread_buf(I));
|
||||
|
||||
inv_std_thread_buf(I) = 1 / ck::math::sqrt(var_thread_buf(I) + epsilon);
|
||||
});
|
||||
|
||||
if(thread_k_cluster_id == 0)
|
||||
{
|
||||
if(p_save_mean_global != nullptr)
|
||||
{
|
||||
threadwise_mean_store.Run(thread_buffer_desc_m,
|
||||
make_tuple(I0),
|
||||
mean_thread_buf,
|
||||
save_mean_grid_desc_m,
|
||||
save_mean_global_val_buf);
|
||||
}
|
||||
if(p_save_inv_std_global != nullptr)
|
||||
{
|
||||
threadwise_inv_std_store.Run(thread_buffer_desc_m,
|
||||
make_tuple(I0),
|
||||
inv_std_thread_buf,
|
||||
save_inv_std_grid_desc_m,
|
||||
save_inv_std_global_val_buf);
|
||||
}
|
||||
}
|
||||
|
||||
auto thread_copy_tail_m_k =
|
||||
(num_k_block_tile_iteration - 1) * ThreadBufferNumber * thread_copy_fwd_step_m_k;
|
||||
|
||||
@@ -437,7 +545,6 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
|
||||
});
|
||||
|
||||
static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
|
||||
auto divisor = 1 / ck::math::sqrt(var_thread_buf(iM) + epsilon);
|
||||
static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) {
|
||||
static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) {
|
||||
constexpr auto offset_m_k =
|
||||
@@ -446,7 +553,7 @@ struct GridwiseNormalizationNaiveVariance_mk_to_mk
|
||||
// normalize
|
||||
y_thread_buf(iK0)(Number<offset_m_k>{}) =
|
||||
(x_thread_buf(iK0)(Number<offset_m_k>{}) - mean_thread_buf(iM)) *
|
||||
divisor;
|
||||
inv_std_thread_buf(iM);
|
||||
|
||||
// gamma
|
||||
y_thread_buf(iK0)(Number<offset_m_k>{}) =
|
||||
|
||||
@@ -12,31 +12,42 @@ template <typename GridwiseReduction,
|
||||
typename GammaDataType,
|
||||
typename BetaDataType,
|
||||
typename YDataType,
|
||||
typename SaveMeanInvStdDataType,
|
||||
typename ComputeDataType,
|
||||
typename YElementwiseOperation,
|
||||
typename GridDesc_M_K>
|
||||
__global__ void kernel_normalization(const GridDesc_M_K x_grid_desc_m_k,
|
||||
const GridDesc_M_K gamma_grid_desc_m_k,
|
||||
const GridDesc_M_K beta_grid_desc_m_k,
|
||||
const GridDesc_M_K y_grid_desc_m_k,
|
||||
index_t num_k_block_tile_iteration,
|
||||
ComputeDataType epsilon,
|
||||
const XDataType* const __restrict__ p_x_global,
|
||||
const GammaDataType* const __restrict__ p_gamma_global,
|
||||
const BetaDataType* const __restrict__ p_beta_global,
|
||||
YDataType* const __restrict__ p_y_global,
|
||||
const YElementwiseOperation y_elementwise_op)
|
||||
typename GridDesc_M_K,
|
||||
typename GridDesc_M>
|
||||
__global__ void
|
||||
kernel_normalization(const GridDesc_M_K x_grid_desc_m_k,
|
||||
const GridDesc_M_K gamma_grid_desc_m_k,
|
||||
const GridDesc_M_K beta_grid_desc_m_k,
|
||||
const GridDesc_M_K y_grid_desc_m_k,
|
||||
const GridDesc_M save_mean_grid_desc_m,
|
||||
const GridDesc_M save_inv_std_grid_desc_m,
|
||||
index_t num_k_block_tile_iteration,
|
||||
ComputeDataType epsilon,
|
||||
const XDataType* const __restrict__ p_x_global,
|
||||
const GammaDataType* const __restrict__ p_gamma_global,
|
||||
const BetaDataType* const __restrict__ p_beta_global,
|
||||
YDataType* const __restrict__ p_y_global,
|
||||
SaveMeanInvStdDataType* const __restrict__ p_save_mean_global,
|
||||
SaveMeanInvStdDataType* const __restrict__ p_save_inv_std_global,
|
||||
const YElementwiseOperation y_elementwise_op)
|
||||
{
|
||||
GridwiseReduction::Run(x_grid_desc_m_k,
|
||||
gamma_grid_desc_m_k,
|
||||
beta_grid_desc_m_k,
|
||||
y_grid_desc_m_k,
|
||||
save_mean_grid_desc_m,
|
||||
save_inv_std_grid_desc_m,
|
||||
num_k_block_tile_iteration,
|
||||
epsilon,
|
||||
p_x_global,
|
||||
p_gamma_global,
|
||||
p_beta_global,
|
||||
p_y_global,
|
||||
p_save_mean_global,
|
||||
p_save_inv_std_global,
|
||||
y_elementwise_op);
|
||||
};
|
||||
|
||||
@@ -44,9 +55,11 @@ template <typename XDataType,
|
||||
typename GammaDataType,
|
||||
typename BetaDataType,
|
||||
typename YDataType,
|
||||
typename SaveMeanInvStdDataType,
|
||||
typename ComputeDataType,
|
||||
typename YElementwiseOperation,
|
||||
typename GridDesc_M_K,
|
||||
typename GridDesc_M,
|
||||
index_t BlockSize,
|
||||
index_t MThreadClusterSize,
|
||||
index_t KThreadClusterSize,
|
||||
@@ -60,6 +73,7 @@ template <typename XDataType,
|
||||
index_t BetaSrcVectorSize,
|
||||
index_t YDstVectorDim,
|
||||
index_t YDstVectorSize,
|
||||
index_t SaveMeanInvStdDstVectorSize,
|
||||
bool UseWelford>
|
||||
auto NormalizationKernelSelector(bool isSweepOnce)
|
||||
{
|
||||
@@ -68,9 +82,11 @@ auto NormalizationKernelSelector(bool isSweepOnce)
|
||||
GammaDataType,
|
||||
BetaDataType,
|
||||
YDataType,
|
||||
SaveMeanInvStdDataType,
|
||||
ComputeDataType,
|
||||
YElementwiseOperation,
|
||||
GridDesc_M_K,
|
||||
GridDesc_M,
|
||||
BlockSize,
|
||||
MThreadClusterSize,
|
||||
KThreadClusterSize,
|
||||
@@ -84,15 +100,18 @@ auto NormalizationKernelSelector(bool isSweepOnce)
|
||||
BetaSrcVectorSize,
|
||||
YDstVectorDim,
|
||||
YDstVectorSize,
|
||||
SaveMeanInvStdDstVectorSize,
|
||||
false>;
|
||||
using GridwiseNormalizationSweepOnceNaive =
|
||||
GridwiseNormalizationNaiveVariance_mk_to_mk<XDataType,
|
||||
GammaDataType,
|
||||
BetaDataType,
|
||||
YDataType,
|
||||
SaveMeanInvStdDataType,
|
||||
ComputeDataType,
|
||||
YElementwiseOperation,
|
||||
GridDesc_M_K,
|
||||
GridDesc_M,
|
||||
BlockSize,
|
||||
MThreadClusterSize,
|
||||
KThreadClusterSize,
|
||||
@@ -106,15 +125,18 @@ auto NormalizationKernelSelector(bool isSweepOnce)
|
||||
BetaSrcVectorSize,
|
||||
YDstVectorDim,
|
||||
YDstVectorSize,
|
||||
SaveMeanInvStdDstVectorSize,
|
||||
true>;
|
||||
using GridwiseNormalizationGenericWelford =
|
||||
GridwiseNormalizationWelfordVariance_mk_to_mk<XDataType,
|
||||
GammaDataType,
|
||||
BetaDataType,
|
||||
YDataType,
|
||||
SaveMeanInvStdDataType,
|
||||
ComputeDataType,
|
||||
YElementwiseOperation,
|
||||
GridDesc_M_K,
|
||||
GridDesc_M,
|
||||
BlockSize,
|
||||
MThreadClusterSize,
|
||||
KThreadClusterSize,
|
||||
@@ -128,15 +150,18 @@ auto NormalizationKernelSelector(bool isSweepOnce)
|
||||
BetaSrcVectorSize,
|
||||
YDstVectorDim,
|
||||
YDstVectorSize,
|
||||
SaveMeanInvStdDstVectorSize,
|
||||
false>;
|
||||
using GridwiseNormalizationSweepOnceWelford =
|
||||
GridwiseNormalizationWelfordVariance_mk_to_mk<XDataType,
|
||||
GammaDataType,
|
||||
BetaDataType,
|
||||
YDataType,
|
||||
SaveMeanInvStdDataType,
|
||||
ComputeDataType,
|
||||
YElementwiseOperation,
|
||||
GridDesc_M_K,
|
||||
GridDesc_M,
|
||||
BlockSize,
|
||||
MThreadClusterSize,
|
||||
KThreadClusterSize,
|
||||
@@ -150,6 +175,7 @@ auto NormalizationKernelSelector(bool isSweepOnce)
|
||||
BetaSrcVectorSize,
|
||||
YDstVectorDim,
|
||||
YDstVectorSize,
|
||||
SaveMeanInvStdDstVectorSize,
|
||||
true>;
|
||||
|
||||
if constexpr(UseWelford)
|
||||
@@ -159,17 +185,21 @@ auto NormalizationKernelSelector(bool isSweepOnce)
|
||||
GammaDataType,
|
||||
BetaDataType,
|
||||
YDataType,
|
||||
SaveMeanInvStdDataType,
|
||||
ComputeDataType,
|
||||
YElementwiseOperation,
|
||||
GridDesc_M_K>
|
||||
GridDesc_M_K,
|
||||
GridDesc_M>
|
||||
: kernel_normalization<GridwiseNormalizationGenericWelford,
|
||||
XDataType,
|
||||
GammaDataType,
|
||||
BetaDataType,
|
||||
YDataType,
|
||||
SaveMeanInvStdDataType,
|
||||
ComputeDataType,
|
||||
YElementwiseOperation,
|
||||
GridDesc_M_K>;
|
||||
GridDesc_M_K,
|
||||
GridDesc_M>;
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -178,17 +208,21 @@ auto NormalizationKernelSelector(bool isSweepOnce)
|
||||
GammaDataType,
|
||||
BetaDataType,
|
||||
YDataType,
|
||||
SaveMeanInvStdDataType,
|
||||
ComputeDataType,
|
||||
YElementwiseOperation,
|
||||
GridDesc_M_K>
|
||||
GridDesc_M_K,
|
||||
GridDesc_M>
|
||||
: kernel_normalization<GridwiseNormalizationGenericNaive,
|
||||
XDataType,
|
||||
GammaDataType,
|
||||
BetaDataType,
|
||||
YDataType,
|
||||
SaveMeanInvStdDataType,
|
||||
ComputeDataType,
|
||||
YElementwiseOperation,
|
||||
GridDesc_M_K>;
|
||||
GridDesc_M_K,
|
||||
GridDesc_M>;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -17,11 +17,13 @@ template <typename MeanVarDataType,
|
||||
typename GammaDataType,
|
||||
typename BetaDataType,
|
||||
typename YDataType,
|
||||
typename SaveMeanInvStdDataType,
|
||||
typename ComputeDataType,
|
||||
typename YElementwiseOperation,
|
||||
typename MeanVarGridDesc_M_KBlock,
|
||||
typename CountGridDesc_M_KBlock,
|
||||
typename XYGammaBetaGridDesc_M_K,
|
||||
typename SaveMeanInvStdGridDesc_M,
|
||||
index_t BlockSize,
|
||||
index_t MThreadClusterSize,
|
||||
index_t KThreadClusterSize,
|
||||
@@ -34,7 +36,8 @@ template <typename MeanVarDataType,
|
||||
index_t BetaSrcVectorDim,
|
||||
index_t BetaSrcVectorSize,
|
||||
index_t YDstVectorDim,
|
||||
index_t YDstVectorSize>
|
||||
index_t YDstVectorSize,
|
||||
index_t SaveMeanInvStdDstVectorSize>
|
||||
struct GridwiseNormalizationSplitK2nd
|
||||
{
|
||||
static_assert((XSrcVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0) ||
|
||||
@@ -45,6 +48,10 @@ struct GridwiseNormalizationSplitK2nd
|
||||
(YDstVectorDim == 1 && KThreadSliceSize % YDstVectorSize == 0),
|
||||
"Invalid thread slice sizes and/or vector sizes configuration, please check!");
|
||||
|
||||
static_assert(MThreadSliceSize % SaveMeanInvStdDstVectorSize == 0,
|
||||
"Invalid thread slice sizes and/or save mean and inverse std vector sizes "
|
||||
"configuration, please check!");
|
||||
|
||||
static_assert(XSrcVectorSize == YDstVectorSize);
|
||||
static_assert(XSrcVectorSize == GammaSrcVectorSize);
|
||||
static_assert(XSrcVectorSize == BetaSrcVectorSize);
|
||||
@@ -69,6 +76,10 @@ struct GridwiseNormalizationSplitK2nd
|
||||
static constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed(
|
||||
make_tuple(Number<MThreadSliceSize>{}, Number<XSrcVectorSize>{}));
|
||||
|
||||
using ThreadBufferLengths_M = Sequence<MThreadSliceSize>;
|
||||
static constexpr auto thread_buffer_desc_m =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));
|
||||
|
||||
using ThreadBufferLengths_M_1 = Sequence<MThreadSliceSize, 1>;
|
||||
static constexpr auto thread_buffer_desc_m_1 =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}, I1));
|
||||
@@ -99,6 +110,8 @@ struct GridwiseNormalizationSplitK2nd
|
||||
const XYGammaBetaGridDesc_M_K& gamma_grid_desc_m_k,
|
||||
const XYGammaBetaGridDesc_M_K& beta_grid_desc_m_k,
|
||||
const XYGammaBetaGridDesc_M_K& y_grid_desc_m_k,
|
||||
const SaveMeanInvStdGridDesc_M& save_mean_grid_desc_m,
|
||||
const SaveMeanInvStdGridDesc_M& save_inv_std_grid_desc_m,
|
||||
index_t num_k_mean_var_count_iteration,
|
||||
index_t num_k_block_tile_iteration,
|
||||
index_t k_grid_size,
|
||||
@@ -110,6 +123,8 @@ struct GridwiseNormalizationSplitK2nd
|
||||
const GammaDataType* const __restrict__ p_gamma_global,
|
||||
const BetaDataType* const __restrict__ p_beta_global,
|
||||
YDataType* const __restrict__ p_y_global,
|
||||
SaveMeanInvStdDataType* const __restrict__ p_save_mean_global,
|
||||
SaveMeanInvStdDataType* const __restrict__ p_save_inv_std_global,
|
||||
const YElementwiseOperation y_elementwise_op)
|
||||
{
|
||||
// Thread/Block id
|
||||
@@ -145,6 +160,12 @@ struct GridwiseNormalizationSplitK2nd
|
||||
auto y_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_y_global, y_grid_desc_m_k.GetElementSpaceSize());
|
||||
|
||||
auto save_mean_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_save_mean_global, save_mean_grid_desc_m.GetElementSpaceSize());
|
||||
|
||||
auto save_inv_std_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_save_inv_std_global, save_inv_std_grid_desc_m.GetElementSpaceSize());
|
||||
|
||||
// VGPR
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MThreadSliceSize, true>
|
||||
in_mean_thread_buf;
|
||||
@@ -158,6 +179,7 @@ struct GridwiseNormalizationSplitK2nd
|
||||
var_thread_buf;
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, int32_t, MThreadSliceSize, true>
|
||||
welford_count_thread_buf;
|
||||
auto& inv_std_thread_buf = var_thread_buf;
|
||||
|
||||
auto x_thread_buf = generate_tuple(
|
||||
[&](auto) {
|
||||
@@ -283,6 +305,42 @@ struct GridwiseNormalizationSplitK2nd
|
||||
thread_k_cluster_id * YDstVectorSize),
|
||||
y_elementwise_op);
|
||||
|
||||
auto threadwise_mean_store =
|
||||
ThreadwiseTensorSliceTransfer_v1r3<ComputeDataType,
|
||||
SaveMeanInvStdDataType,
|
||||
decltype(thread_buffer_desc_m),
|
||||
SaveMeanInvStdGridDesc_M,
|
||||
PassThroughOp,
|
||||
ThreadBufferLengths_M,
|
||||
Sequence<0>, // DimAccessOrder
|
||||
0, // SrcVectorDim
|
||||
SaveMeanInvStdDstVectorSize, // ScalarPerVector
|
||||
InMemoryDataOperationEnum::Set,
|
||||
1,
|
||||
true>(
|
||||
save_mean_grid_desc_m,
|
||||
make_multi_index(block_m_cluster_id * M_BlockTileSize +
|
||||
thread_m_cluster_id * MThreadSliceSize),
|
||||
PassThroughOp{});
|
||||
|
||||
auto threadwise_inv_std_store =
|
||||
ThreadwiseTensorSliceTransfer_v1r3<ComputeDataType,
|
||||
SaveMeanInvStdDataType,
|
||||
decltype(thread_buffer_desc_m),
|
||||
SaveMeanInvStdGridDesc_M,
|
||||
PassThroughOp,
|
||||
ThreadBufferLengths_M,
|
||||
Sequence<0>, // DimAccessOrder
|
||||
0, // SrcVectorDim
|
||||
SaveMeanInvStdDstVectorSize, // ScalarPerVector
|
||||
InMemoryDataOperationEnum::Set,
|
||||
1,
|
||||
true>(
|
||||
save_inv_std_grid_desc_m,
|
||||
make_multi_index(block_m_cluster_id * M_BlockTileSize +
|
||||
thread_m_cluster_id * MThreadSliceSize),
|
||||
PassThroughOp{});
|
||||
|
||||
// step1: Merge mean and variance
|
||||
constexpr auto mean_var_count_thread_copy_step_I0_k =
|
||||
make_multi_index(I0, KThreadClusterSize);
|
||||
@@ -332,9 +390,33 @@ struct GridwiseNormalizationSplitK2nd
|
||||
|
||||
BlockwiseWelford::Run(
|
||||
mean_thread_buf(I), var_thread_buf(I), welford_count_thread_buf(I));
|
||||
|
||||
inv_std_thread_buf(I) =
|
||||
type_convert<ComputeDataType>(1.0f) / ck::math::sqrt(var_thread_buf(I) + epsilon);
|
||||
});
|
||||
|
||||
// step2: normalization
|
||||
// step2: save mean and inverse std for backward (optional)
|
||||
if(block_k_cluster_id == 0 && thread_k_cluster_id == 0)
|
||||
{
|
||||
if(p_save_mean_global != nullptr)
|
||||
{
|
||||
threadwise_mean_store.Run(thread_buffer_desc_m,
|
||||
make_tuple(I0),
|
||||
mean_thread_buf,
|
||||
save_mean_grid_desc_m,
|
||||
save_mean_global_val_buf);
|
||||
}
|
||||
if(p_save_inv_std_global != nullptr)
|
||||
{
|
||||
threadwise_inv_std_store.Run(thread_buffer_desc_m,
|
||||
make_tuple(I0),
|
||||
inv_std_thread_buf,
|
||||
save_inv_std_grid_desc_m,
|
||||
save_inv_std_global_val_buf);
|
||||
}
|
||||
}
|
||||
|
||||
// step3: normalization
|
||||
constexpr auto thread_copy_fwd_step_m_k = make_multi_index(0, K_BlockTileStepSize);
|
||||
|
||||
for(index_t k = 0; k < num_k_block_tile_iteration; ++k)
|
||||
@@ -360,7 +442,6 @@ struct GridwiseNormalizationSplitK2nd
|
||||
});
|
||||
|
||||
static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
|
||||
auto divisor = 1 / ck::math::sqrt(var_thread_buf(iM) + epsilon);
|
||||
static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) {
|
||||
static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) {
|
||||
constexpr auto offset_m_k =
|
||||
@@ -369,7 +450,7 @@ struct GridwiseNormalizationSplitK2nd
|
||||
// normalize
|
||||
y_thread_buf(iK0)(Number<offset_m_k>{}) =
|
||||
(x_thread_buf(iK0)(Number<offset_m_k>{}) - mean_thread_buf(iM)) *
|
||||
divisor;
|
||||
inv_std_thread_buf(iM);
|
||||
|
||||
// gamma
|
||||
y_thread_buf(iK0)(Number<offset_m_k>{}) =
|
||||
|
||||
@@ -16,9 +16,11 @@ template <typename XDataType,
|
||||
typename GammaDataType,
|
||||
typename BetaDataType,
|
||||
typename YDataType,
|
||||
typename SaveMeanInvStdDataType,
|
||||
typename ComputeDataType,
|
||||
typename YElementwiseOperation,
|
||||
typename GridDesc_M_K,
|
||||
typename GridDesc_M,
|
||||
index_t BlockSize,
|
||||
index_t MThreadClusterSize,
|
||||
index_t KThreadClusterSize,
|
||||
@@ -32,6 +34,7 @@ template <typename XDataType,
|
||||
index_t BetaSrcVectorSize,
|
||||
index_t YDstVectorDim,
|
||||
index_t YDstVectorSize,
|
||||
index_t SaveMeanInvStdDstVectorSize,
|
||||
bool SweepOnce>
|
||||
struct GridwiseNormalizationWelfordVariance_mk_to_mk
|
||||
{
|
||||
@@ -43,6 +46,10 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk
|
||||
(YDstVectorDim == 1 && KThreadSliceSize % YDstVectorSize == 0),
|
||||
"Invalid thread slice sizes and/or vector sizes configuration, please check!");
|
||||
|
||||
static_assert(MThreadSliceSize % SaveMeanInvStdDstVectorSize == 0,
|
||||
"Invalid thread slice sizes and/or save mean and inverse std vector sizes "
|
||||
"configuration, please check!");
|
||||
|
||||
static_assert(XSrcVectorSize == YDstVectorSize);
|
||||
static_assert(XSrcVectorSize == GammaSrcVectorSize);
|
||||
static_assert(XSrcVectorSize == BetaSrcVectorSize);
|
||||
@@ -64,6 +71,10 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk
|
||||
static constexpr auto thread_buffer_desc_m_k = make_naive_tensor_descriptor_packed(
|
||||
make_tuple(Number<MThreadSliceSize>{}, Number<XSrcVectorSize>{}));
|
||||
|
||||
using ThreadBufferLengths_M = Sequence<MThreadSliceSize>;
|
||||
static constexpr auto thread_buffer_desc_m =
|
||||
make_naive_tensor_descriptor_packed(make_tuple(Number<MThreadSliceSize>{}));
|
||||
|
||||
using ThreadReduceSrcDesc_M_K = decltype(make_naive_tensor_descriptor_packed(
|
||||
make_tuple(Number<MThreadSliceSize>{}, Number<XSrcVectorSize>{})));
|
||||
using ThreadReduceDstDesc_M =
|
||||
@@ -77,6 +88,8 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk
|
||||
ThreadClusterLengths_M_K,
|
||||
ThreadClusterArrangeOrder>;
|
||||
|
||||
using PassThroughOp = tensor_operation::element_wise::PassThrough;
|
||||
|
||||
static constexpr auto I0 = Number<0>{};
|
||||
static constexpr auto I1 = Number<1>{};
|
||||
static constexpr auto I2 = Number<2>{};
|
||||
@@ -114,17 +127,18 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk
|
||||
const GridDesc_M_K& gamma_grid_desc_m_k,
|
||||
const GridDesc_M_K& beta_grid_desc_m_k,
|
||||
const GridDesc_M_K& y_grid_desc_m_k,
|
||||
const GridDesc_M& save_mean_grid_desc_m,
|
||||
const GridDesc_M& save_inv_std_grid_desc_m,
|
||||
index_t num_k_block_tile_iteration,
|
||||
ComputeDataType epsilon,
|
||||
const XDataType* const __restrict__ p_x_global,
|
||||
const GammaDataType* const __restrict__ p_gamma_global,
|
||||
const BetaDataType* const __restrict__ p_beta_global,
|
||||
YDataType* const __restrict__ p_y_global,
|
||||
SaveMeanInvStdDataType* const __restrict__ p_save_mean_global,
|
||||
SaveMeanInvStdDataType* const __restrict__ p_save_inv_std_global,
|
||||
const YElementwiseOperation y_elementwise_op)
|
||||
{
|
||||
auto y_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_y_global, y_grid_desc_m_k.GetElementSpaceSize());
|
||||
|
||||
auto x_thread_buf = generate_tuple(
|
||||
[&](auto) {
|
||||
return StaticBuffer<AddressSpaceEnum::Vgpr,
|
||||
@@ -150,6 +164,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk
|
||||
mean_thread_buf;
|
||||
StaticBuffer<AddressSpaceEnum::Vgpr, ComputeDataType, MThreadSliceSize, true>
|
||||
var_thread_buf;
|
||||
auto& inv_std_thread_buf = var_thread_buf;
|
||||
|
||||
const index_t thread_local_id = get_thread_local_1d_id();
|
||||
const index_t block_global_id = get_block_1d_id();
|
||||
@@ -226,6 +241,42 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk
|
||||
thread_k_cluster_id * YDstVectorSize),
|
||||
y_elementwise_op);
|
||||
|
||||
auto threadwise_mean_store =
|
||||
ThreadwiseTensorSliceTransfer_v1r3<ComputeDataType,
|
||||
SaveMeanInvStdDataType,
|
||||
decltype(thread_buffer_desc_m),
|
||||
GridDesc_M,
|
||||
PassThroughOp,
|
||||
ThreadBufferLengths_M,
|
||||
Sequence<0>, // DimAccessOrder
|
||||
0, // SrcVectorDim
|
||||
SaveMeanInvStdDstVectorSize, // ScalarPerVector
|
||||
InMemoryDataOperationEnum::Set,
|
||||
1,
|
||||
true>(
|
||||
save_mean_grid_desc_m,
|
||||
make_multi_index(block_global_id * M_BlockTileSize +
|
||||
thread_m_cluster_id * MThreadSliceSize),
|
||||
PassThroughOp{});
|
||||
|
||||
auto threadwise_inv_std_store =
|
||||
ThreadwiseTensorSliceTransfer_v1r3<ComputeDataType,
|
||||
SaveMeanInvStdDataType,
|
||||
decltype(thread_buffer_desc_m),
|
||||
GridDesc_M,
|
||||
PassThroughOp,
|
||||
ThreadBufferLengths_M,
|
||||
Sequence<0>, // DimAccessOrder
|
||||
0, // SrcVectorDim
|
||||
SaveMeanInvStdDstVectorSize, // ScalarPerVector
|
||||
InMemoryDataOperationEnum::Set,
|
||||
1,
|
||||
true>(
|
||||
save_inv_std_grid_desc_m,
|
||||
make_multi_index(block_global_id * M_BlockTileSize +
|
||||
thread_m_cluster_id * MThreadSliceSize),
|
||||
PassThroughOp{});
|
||||
|
||||
constexpr auto thread_copy_fwd_step_m_k = make_multi_index(0, K_BlockTileStepSize);
|
||||
constexpr auto thread_copy_bwd_step_m_k =
|
||||
make_multi_index(0, SweepOnce ? 0 : -K_BlockTileSize);
|
||||
@@ -239,6 +290,15 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk
|
||||
const auto beta_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_beta_global, beta_grid_desc_m_k.GetElementSpaceSize());
|
||||
|
||||
auto y_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_y_global, y_grid_desc_m_k.GetElementSpaceSize());
|
||||
|
||||
auto save_mean_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_save_mean_global, save_mean_grid_desc_m.GetElementSpaceSize());
|
||||
|
||||
auto save_inv_std_global_val_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
|
||||
p_save_inv_std_global, save_inv_std_grid_desc_m.GetElementSpaceSize());
|
||||
|
||||
auto threadwise_welford = ThreadwiseWelford();
|
||||
threadwise_welford.max_count_ = GetKPerThread(x_grid_desc_m_k, thread_k_cluster_id);
|
||||
|
||||
@@ -279,10 +339,33 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk
|
||||
|
||||
int count = threadwise_welford.cur_count_;
|
||||
BlockwiseWelford::Run(mean_thread_buf(I), var_thread_buf(I), count);
|
||||
inv_std_thread_buf(I) = type_convert<ComputeDataType>(1.0f) /
|
||||
ck::math::sqrt(var_thread_buf(I) + epsilon);
|
||||
});
|
||||
|
||||
// save mean and inverse std for backward (optional)
|
||||
if(thread_k_cluster_id == 0)
|
||||
{
|
||||
if(p_save_mean_global != nullptr)
|
||||
{
|
||||
threadwise_mean_store.Run(thread_buffer_desc_m,
|
||||
make_tuple(I0),
|
||||
mean_thread_buf,
|
||||
save_mean_grid_desc_m,
|
||||
save_mean_global_val_buf);
|
||||
}
|
||||
if(p_save_inv_std_global != nullptr)
|
||||
{
|
||||
threadwise_inv_std_store.Run(thread_buffer_desc_m,
|
||||
make_tuple(I0),
|
||||
inv_std_thread_buf,
|
||||
save_inv_std_grid_desc_m,
|
||||
save_inv_std_global_val_buf);
|
||||
}
|
||||
}
|
||||
|
||||
// normalization
|
||||
static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
|
||||
auto divisor = 1 / ck::math::sqrt(var_thread_buf(iM) + epsilon);
|
||||
static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) {
|
||||
static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) {
|
||||
constexpr auto offset_m_k =
|
||||
@@ -291,7 +374,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk
|
||||
// normalize
|
||||
y_thread_buf(iK0)(Number<offset_m_k>{}) =
|
||||
(x_thread_buf(iK0)(Number<offset_m_k>{}) - mean_thread_buf(iM)) *
|
||||
divisor;
|
||||
inv_std_thread_buf(iM);
|
||||
|
||||
// gamma & beta
|
||||
y_thread_buf(iK0)(Number<offset_m_k>{}) =
|
||||
@@ -360,8 +443,29 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk
|
||||
|
||||
int count = threadwise_welford.cur_count_;
|
||||
BlockwiseWelford::Run(mean_thread_buf(I), var_thread_buf(I), count);
|
||||
inv_std_thread_buf(I) = 1 / ck::math::sqrt(var_thread_buf(I) + epsilon);
|
||||
});
|
||||
|
||||
if(thread_k_cluster_id == 0)
|
||||
{
|
||||
if(p_save_mean_global != nullptr)
|
||||
{
|
||||
threadwise_mean_store.Run(thread_buffer_desc_m,
|
||||
make_tuple(I0),
|
||||
mean_thread_buf,
|
||||
save_mean_grid_desc_m,
|
||||
save_mean_global_val_buf);
|
||||
}
|
||||
if(p_save_inv_std_global != nullptr)
|
||||
{
|
||||
threadwise_inv_std_store.Run(thread_buffer_desc_m,
|
||||
make_tuple(I0),
|
||||
inv_std_thread_buf,
|
||||
save_inv_std_grid_desc_m,
|
||||
save_inv_std_global_val_buf);
|
||||
}
|
||||
}
|
||||
|
||||
auto thread_copy_tail_m_k =
|
||||
(num_k_block_tile_iteration - 1) * ThreadBufferNumber * thread_copy_fwd_step_m_k;
|
||||
|
||||
@@ -393,7 +497,6 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk
|
||||
});
|
||||
|
||||
static_for<0, MThreadSliceSize, 1>{}([&](auto iM) {
|
||||
auto divisor = 1 / ck::math::sqrt(var_thread_buf(iM) + epsilon);
|
||||
static_for<0, ThreadBufferNumber, 1>{}([&](auto iK0) {
|
||||
static_for<0, XSrcVectorSize, 1>{}([&](auto iK1) {
|
||||
constexpr auto offset_m_k =
|
||||
@@ -402,7 +505,7 @@ struct GridwiseNormalizationWelfordVariance_mk_to_mk
|
||||
// normalize
|
||||
y_thread_buf(iK0)(Number<offset_m_k>{}) =
|
||||
(x_thread_buf(iK0)(Number<offset_m_k>{}) - mean_thread_buf(iM)) *
|
||||
divisor;
|
||||
inv_std_thread_buf(iM);
|
||||
|
||||
// gamma
|
||||
y_thread_buf(iK0)(Number<offset_m_k>{}) =
|
||||
|
||||
Reference in New Issue
Block a user