Refine layernorm naming and test code (#497)

* Sync the naming

* Sync the test of layernorm with groupnorm

* Sync the naming

* Minor change for comment and log

* [What] Add saveMean and SaveInvVariance in the interface.
[Why] These can optimize the backward
This commit is contained in:
rocking5566
2022-11-03 06:57:28 +08:00
committed by GitHub
parent 451f1e3d65
commit d4d1147f0a
15 changed files with 207 additions and 311 deletions

View File

@@ -33,6 +33,8 @@ struct DeviceNormalization : public BaseOperator
const void* p_gamma,
const void* p_beta,
void* p_y,
void* p_savedMean,
void* p_savedInvVar,
AccElementwiseOperation acc_elementwise_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;

View File

@@ -10,7 +10,7 @@
#include "ck/tensor_operation/gpu/device/device_normalization.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_layernorm_welford_variance.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_normalization_welford_variance.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
@@ -24,17 +24,17 @@ template <typename GridwiseReduction,
typename AccDataType,
typename AccElementwiseOperation,
typename GridDesc_M_K>
__global__ void kernel_layernorm(const GridDesc_M_K x_grid_desc_m_k,
const GridDesc_M_K gamma_grid_desc_m_k,
const GridDesc_M_K beta_grid_desc_m_k,
const GridDesc_M_K y_grid_desc_m_k,
index_t num_k_block_tile_iteration,
AccDataType epsilon,
const XDataType* const __restrict__ p_x_global,
const GammaDataType* const __restrict__ p_gamma_global,
const BetaDataType* const __restrict__ p_beta_global,
YDataType* const __restrict__ p_y_global,
const AccElementwiseOperation acc_elementwise_op)
__global__ void kernel_normalization(const GridDesc_M_K x_grid_desc_m_k,
const GridDesc_M_K gamma_grid_desc_m_k,
const GridDesc_M_K beta_grid_desc_m_k,
const GridDesc_M_K y_grid_desc_m_k,
index_t num_k_block_tile_iteration,
AccDataType epsilon,
const XDataType* const __restrict__ p_x_global,
const GammaDataType* const __restrict__ p_gamma_global,
const BetaDataType* const __restrict__ p_beta_global,
YDataType* const __restrict__ p_y_global,
const AccElementwiseOperation acc_elementwise_op)
{
GridwiseReduction::Run(x_grid_desc_m_k,
gamma_grid_desc_m_k,
@@ -54,7 +54,7 @@ namespace ck {
namespace tensor_operation {
namespace device {
// Y = LayerNorm(X, Beta, Gamma)
// Y = Normalization(X, Beta, Gamma)
template <typename XDataType,
typename GammaDataType,
typename BetaDataType,
@@ -168,49 +168,49 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
using GridDesc_M_K = decltype(MakeSrc2dDescriptor({1}, {1}, 1, 1));
using GridwiseReduceLayernormGeneric =
GridwiseLayernormWelfordVariance_mk_to_mk<XDataType,
GammaDataType,
BetaDataType,
YDataType,
AccDataType,
AccElementwiseOperation,
GridDesc_M_K,
BlockSize,
MThreadClusterSize,
KThreadClusterSize,
MThreadSliceSize,
KThreadSliceSize,
XYSrcVectorDim,
XSrcVectorSize,
GammaSrcVectorDim,
GammaSrcVectorSize,
BetaSrcVectorDim,
BetaSrcVectorSize,
XYSrcVectorDim,
YDstVectorSize,
false>;
using GridwiseReduceLayernormSweepOnce =
GridwiseLayernormWelfordVariance_mk_to_mk<XDataType,
GammaDataType,
BetaDataType,
YDataType,
AccDataType,
AccElementwiseOperation,
GridDesc_M_K,
BlockSize,
MThreadClusterSize,
KThreadClusterSize,
MThreadSliceSize,
KThreadSliceSize,
XYSrcVectorDim,
XSrcVectorSize,
GammaSrcVectorDim,
GammaSrcVectorSize,
BetaSrcVectorDim,
BetaSrcVectorSize,
XYSrcVectorDim,
YDstVectorSize,
true>;
GridwiseNormalizationWelfordVariance_mk_to_mk<XDataType,
GammaDataType,
BetaDataType,
YDataType,
AccDataType,
AccElementwiseOperation,
GridDesc_M_K,
BlockSize,
MThreadClusterSize,
KThreadClusterSize,
MThreadSliceSize,
KThreadSliceSize,
XYSrcVectorDim,
XSrcVectorSize,
GammaSrcVectorDim,
GammaSrcVectorSize,
BetaSrcVectorDim,
BetaSrcVectorSize,
XYSrcVectorDim,
YDstVectorSize,
false>;
using GridwiseNormalizationSweepOnce =
GridwiseNormalizationWelfordVariance_mk_to_mk<XDataType,
GammaDataType,
BetaDataType,
YDataType,
AccDataType,
AccElementwiseOperation,
GridDesc_M_K,
BlockSize,
MThreadClusterSize,
KThreadClusterSize,
MThreadSliceSize,
KThreadSliceSize,
XYSrcVectorDim,
XSrcVectorSize,
GammaSrcVectorDim,
GammaSrcVectorSize,
BetaSrcVectorDim,
BetaSrcVectorSize,
XYSrcVectorDim,
YDstVectorSize,
true>;
struct Argument : public BaseArgument
{
@@ -295,22 +295,22 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{})
{
const auto kernel_main = arg.isSweeponce_
? kernel_layernorm<GridwiseReduceLayernormSweepOnce,
XDataType,
GammaDataType,
BetaDataType,
YDataType,
AccDataType,
AccElementwiseOperation,
GridDesc_M_K>
: kernel_layernorm<GridwiseReduceLayernormGeneric,
XDataType,
GammaDataType,
BetaDataType,
YDataType,
AccDataType,
AccElementwiseOperation,
GridDesc_M_K>;
? kernel_normalization<GridwiseNormalizationSweepOnce,
XDataType,
GammaDataType,
BetaDataType,
YDataType,
AccDataType,
AccElementwiseOperation,
GridDesc_M_K>
: kernel_normalization<GridwiseReduceLayernormGeneric,
XDataType,
GammaDataType,
BetaDataType,
YDataType,
AccDataType,
AccElementwiseOperation,
GridDesc_M_K>;
float avg_time = 0;
avg_time += launch_and_time_kernel(stream_config,
@@ -426,8 +426,16 @@ struct DeviceNormalizationImpl : public DeviceNormalization<XDataType,
const void* p_gamma,
const void* p_beta,
void* p_y,
void* p_saveMean,
void* p_saveInvVar,
AccElementwiseOperation acc_elementwise_op) override
{
// TODO
// Optional cache of the intermediate results (mean and InvVariance) during the
// forward pass could speedup in the backward
ignore = p_saveMean;
ignore = p_saveInvVar;
return std::make_unique<Argument>(lengths,
xStrides,
gammaStrides,

View File

@@ -14,7 +14,7 @@
namespace ck {
// Y = LayerNorm(X, Beta, Gamma)
// Y = Normalization(X, Beta, Gamma)
template <typename XDataType,
typename GammaDataType,
typename BetaDataType,
@@ -36,7 +36,7 @@ template <typename XDataType,
index_t YDstVectorDim,
index_t YDstVectorSize,
bool SweepOnce>
struct GridwiseLayernormNaiveVariance_mk_to_mk
struct GridwiseNormalizationNaiveVariance_mk_to_mk
{
static_assert((XSrcVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0) ||
(XSrcVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0),

View File

@@ -11,7 +11,7 @@
namespace ck {
// Y = LayerNorm(X, Beta, Gamma)
// Y = Normalization(X, Beta, Gamma)
template <typename XDataType,
typename GammaDataType,
typename BetaDataType,
@@ -33,7 +33,7 @@ template <typename XDataType,
index_t YDstVectorDim,
index_t YDstVectorSize,
bool SweepOnce>
struct GridwiseLayernormWelfordVariance_mk_to_mk
struct GridwiseNormalizationWelfordVariance_mk_to_mk
{
static_assert((XSrcVectorDim == 0 && MThreadSliceSize % XSrcVectorSize == 0) ||
(XSrcVectorDim == 1 && KThreadSliceSize % XSrcVectorSize == 0),