From 723ccf741e95c03295511c9dc9761659d116fa20 Mon Sep 17 00:00:00 2001
From: AMD-dteng
Date: Wed, 8 Jan 2025 17:51:06 +0800
Subject: [PATCH] enable bias feature that adds bias before adding residual
 (for rtpllm project) (#1741)

* 1. enable bias feature that adds bias before adding residual; 2. change
  block size from 128 to 64 when m<64 in fp16

* delete comment

* 1. remove fmha change 2. change buffer name from bias to xbias

* Now bias can be used independently from fadd

* change kbias to kxbias

---------

Co-authored-by: feli
[ROCm/composable_kernel commit: d5c8a334ca170cdbf80b3ce34a714c94f22324bf]
---
 example/ck_tile/02_layernorm2d/generate.py   | 132 ++++++++++--------
 .../02_layernorm2d/layernorm2d_fwd.cpp       |  31 +++-
 .../02_layernorm2d/layernorm2d_fwd.hpp       |   3 +
 .../kernel/layernorm2d_fwd_kernel.hpp        |  28 ++++
 .../layernorm2d_fwd_pipeline_one_pass.hpp    |  20 ++-
 .../layernorm2d_fwd_pipeline_problem.hpp     |   2 +
 .../layernorm2d_fwd_pipeline_two_pass.hpp    |  39 +++++-
 .../pipeline/layernorm2d_fwd_traits.hpp      |  15 ++
 8 files changed, 205 insertions(+), 65 deletions(-)

diff --git a/example/ck_tile/02_layernorm2d/generate.py b/example/ck_tile/02_layernorm2d/generate.py
index 0581c45970..2cce0042a9 100644
--- a/example/ck_tile/02_layernorm2d/generate.py
+++ b/example/ck_tile/02_layernorm2d/generate.py
@@ -23,6 +23,10 @@ def get_if_str(idx, total, lase_else = True):
     else:
         return 'else if'
 
+XBIAS_ENUM_STR_MAP = [
+    'no',
+    'xbias']  # pre-norm add bias
+
 FUSED_ADD_ENUM_STR_MAP = [
     'no',
     'pras',  # pre-norm
@@ -60,6 +64,7 @@ template struct layernorm2d_fwd_traits_
@@ -123,6 +128,7 @@ struct layernorm2d_fwd_traits_
     static constexpr bool kFastFDiv = kFastFDiv_;
     static constexpr bool kWelford  = kWelford_;
     static constexpr bool kTwoPass  = kTwoPass_;
+    static constexpr ck_tile::index_t kXbias      = kXbias_;
     static constexpr ck_tile::index_t kFusedAdd   = kFusedAdd_;
     static constexpr ck_tile::index_t kFusedQuant = kFusedQuant_;
 };
@@ -141,6 +147,7 @@ template using traits_ = layernorm2d_fwd_traits_;
 """
@@ -190,10 +198,12 @@ float layernorm2d_fwd_(const S& s, A a)
         Traits_::kFastFDiv,
         Traits_::kWelford,
         Traits_::kTwoPass,
+        static_cast<ck_tile::Layernorm2dXBiasEnum>(Traits_::kXbias),
         static_cast<ck_tile::Layernorm2dFusedAddEnum>(Traits_::kFusedAdd),
         static_cast<ck_tile::Layernorm2dFusedQuantEnum>(Traits_::kFusedQuant)>;
 
     using PipelineProblem = ck_tile::Layernorm2dFwdPipelineProblem<
         typename LayerNormTypeConfig::XDataType,
+        typename LayerNormTypeConfig::XBiasDataType,
         typename LayerNormTypeConfig::GammaDataType,
         typename LayerNormTypeConfig::BetaDataType,
         typename LayerNormTypeConfig::ComputeDataType,
@@ -280,7 +290,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
 #include "layernorm2d_fwd_api_common.hpp"
 
 // clang-format off
-//  prec_i prec_o prec_sy  rm  rn  tm  tn  vn  pd  mv  rpcf  welford  2p  add  sweep
+//  prec_i prec_o prec_sy  rm  rn  tm  tn  vn  pd  mv  rpcf  welford  2p  xbias  add  sweep
 {F_instance_def}
 // clang-format on
@@ -290,6 +300,10 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
         self.working_path = working_path
         self.kernel_filter = kernel_filter
 
+    class k_xbias_enum(IntEnum):
+        F_NO_XBIAS  = 0
+        F_ADD_XBIAS = 1
+
     class k_fuesd_add_enum(IntEnum):
         F_NO_ADD  = 0
         F_PRE_ADD = 1
@@ -305,6 +319,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
         F_kPadN            : bool
         F_kSaveMeanInvStd  : bool
         F_kTwoPass         : bool
+        F_kXbias           : Any  #: layernorm_fwd_codegen.k_bias_enum
         F_kFusedAdd        : Any  #: layernorm_fwd_codegen.k_fuesd_add_enum
         F_kFusedQuant      : Any  #: layernorm_fwd_codegen.k_fused_sweep_enum
@@ -321,6 +336,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
     @dataclass
     class k_problem:
         F_XDataType        : str
+        F_XBiasDataType    : str
         F_GammaDataType    : str
         F_BetaDataType     : str
         F_ComputeDataType  : str
@@ -370,6 +386,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
         F_kFastFDiv_       : bool
         F_kWelford_        : bool
         F_kTwoPass_        : bool
+        F_kXbias_          : int
         F_kFusedAdd        : int
         F_kFusedQuant      : int
@@ -377,7 +394,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
         def trait_name(self) ->str:
             t_ = f'{DATA_TYPE_MAP[self.F_XDataType]}, {DATA_TYPE_MAP[self.F_YDataType]}, {DATA_TYPE_MAP[self.F_XScaleDataType]}, {DATA_TYPE_MAP[self.F_YScaleDataType]}, {self.F_Repeat_M:2}, {self.F_Repeat_N:2}, {self.F_ThreadPerBlock_M:2}, {self.F_ThreadPerBlock_N:4}'
             t_ += f', {self.F_Vector_N:2}, {BOOL_MAP(self.F_kPadN):5}, {BOOL_MAP(self.F_kSaveMeanInvStd_):5}, {BOOL_MAP(self.F_kFastFDiv_):5}, {BOOL_MAP(self.F_kWelford_):5}'
-            t_ += f', {BOOL_MAP(self.F_kTwoPass_):5}, {self.F_kFusedAdd:4}, {self.F_kFusedQuant:4}'
+            t_ += f', {BOOL_MAP(self.F_kTwoPass_):5}, {self.F_kXbias:4}, {self.F_kFusedAdd:4}, {self.F_kFusedQuant:4}'
             return t_
 
         # string when calling this kernel
@@ -395,6 +412,7 @@ class h_instance:
         F_DataTypePair : str
         F_N            : str
+        F_xbias        : int
         F_add          : int
         F_sweep        : int
         instance_list  : List[Any]  # List[h_traits]
@@ -404,6 +422,8 @@
             prec_i, prec_o = self.F_DataTypePair.split(',')
             dtype_str = f'{prec_i}' if prec_i == prec_o else f'{prec_i}_{prec_o}'
             nnn = f'layernorm2d_fwd_{dtype_str}_n{self.F_N}'
+            if self.F_xbias != 0:
+                nnn = nnn + '_' + XBIAS_ENUM_STR_MAP[self.F_xbias]
             if self.F_add != 0:
                 nnn = nnn + '_' + FUSED_ADD_ENUM_STR_MAP[self.F_add]
             if self.F_sweep != 0:
@@ -462,8 +482,8 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
             elif ins.F_kFusedQuant == 2:
                 _sweep_cond = 't.fused_quant == {f_fused_sweep} && (t.prec_sy == \"{f_sy_type}\")'.format(
                     f_fused_sweep = ins.F_kFusedQuant, f_sy_type=ins.F_YScaleDataType)
-            _cond = '((a.n % {f_vec_n} == 0) && (t.fused_add == {f_fused_add}) && ({f_sweep_cond}))'.format(
-                f_vec_n = ins.F_Vector_N, f_fused_add = ins.F_kFusedAdd,
+            _cond = '((a.n % {f_vec_n} == 0) && (t.xbias == {f_xbias}) && (t.fused_add == {f_fused_add}) && ({f_sweep_cond}))'.format(
+                f_vec_n = ins.F_Vector_N, f_xbias = ins.F_kXbias, f_fused_add = ins.F_kFusedAdd,
                 f_sweep_cond = _sweep_cond)
             inner_str += self.API_INNER_CASE.format(F_if = get_if_str(idx_in_n, len_in_n, False), F_VEC_COND = _cond, F_instance_func=ins.call_name)
@@ -494,62 +514,63 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
         types_16bit = ('int16', 'fp16', 'bf16')
         #fused_add_list = [0, 1, 2]
         #fused_sweep_list = [0, 1, 2]  # NOTE: only single pass can use fused dynamic quant
+        xbias_list = [0, 1]
         fused_add_list = [0, 1]
         fused_sweep_list = [0, 1]  # NOTE: only single pass can use fused dynamic quant
 
-        #                          rm  rn  tm  tn  vn  pd  mv  fdiv  welford  2p  add  sweep
-        h_trait_dict = {'64'  : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 8, 8, 8, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 16, 4, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 1, True, False, True, True, False, 0, 0)],
-                        '128' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 16, 8, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 2, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 2, 4, 64, 1, True, False, True, True, False, 0, 0)],
-                        '256' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 4, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 2, 4, 64, 2, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 4, 4, 64, 1, True, False, True, True, False, 0, 0)],
-                        '512' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 8, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 2, 4, 64, 4, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 4, 4, 64, 2, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 8, 4, 64, 1, True, False, True, True, False, 0, 0)],
-                        '768' : [ h_traits('x', 'y', 'xs', 'ys', 1, 3, 4, 64, 4, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 6, 4, 64, 2, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 12, 4, 64, 1, True, False, True, True, False, 0, 0)],
-                        '1024' :[ h_traits('x', 'y', 'xs', 'ys', 1, 1, 2, 128, 8, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 2, 2, 128, 4, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 4, 2, 128, 2, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 1, True, False, True, True, False, 0, 0)],
-                        '1536' :[ h_traits('x', 'y', 'xs', 'ys', 1, 3, 4, 64, 8, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 3, 2, 128, 4, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 256, 2, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 6, 1, 256, 1, True, False, True, True, False, 0, 0)],
-                        '2048' :[ h_traits('x', 'y', 'xs', 'ys', 1, 1, 1, 256, 8, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 2, 1, 256, 4, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 2, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 8, 1, 256, 1, True, False, True, True, False, 0, 0)],
-                        '3072' :[ h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 128, 8, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 256, 4, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 6, 1, 256, 2, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 3, 1,1024, 1, True, False, True, True, False, 0, 0)],
-                        '4096' :[ h_traits('x', 'y', 'xs', 'ys', 1, 2, 1, 256, 8, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 4, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 2, 1,1024, 2, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 4, 1,1024, 1, True, False, True, True, False, 0, 0)],
-                        '6144' :[ h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 256, 8, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 512, 4, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 3, 1,1024, 2, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 6, 1,1024, 1, True, False, True, True, False, 0, 0)],
-                        '8192' :[ h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 8, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 512, 4, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 4, 1,1024, 2, True, False, True, True, False, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 8, 1,1024, 1, True, False, True, True, False, 0, 0)],
-                        'big'  :[ h_traits('x', 'y', 'xs', 'ys', 1, 2, 1, 256, 8, True, False, True, True, True, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 4, True, False, True, True, True, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 2, 1,1024, 2, True, False, True, True, True, 0, 0),
-                                  h_traits('x', 'y', 'xs', 'ys', 1, 4, 1,1024, 1, True, False, True, True, True, 0, 0)]}
+        #                          rm  rn  tm  tn  vn  pd  mv  fdiv  welford  2p  xbias  add  sweep
+        h_trait_dict = {'64'  : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 8, 8, 8, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 16, 4, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 1, True, False, True, True, False, 0, 0, 0)],
+                        '128' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 16, 8, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 2, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 2, 4, 64, 1, True, False, True, True, False, 0, 0, 0)],
+                        '256' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 4, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 2, 4, 64, 2, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 4, 4, 64, 1, True, False, True, True, False, 0, 0, 0)],
+                        '512' : [ h_traits('x', 'y', 'xs', 'ys', 1, 1, 4, 64, 8, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 2, 4, 64, 4, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 4, 4, 64, 2, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 8, 4, 64, 1, True, False, True, True, False, 0, 0, 0)],
+                        '768' : [ h_traits('x', 'y', 'xs', 'ys', 1, 3, 4, 64, 4, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 6, 4, 64, 2, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 12, 4, 64, 1, True, False, True, True, False, 0, 0, 0)],
+                        '1024' :[ h_traits('x', 'y', 'xs', 'ys', 1, 1, 2, 128, 8, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 2, 2, 128, 4, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 4, 2, 128, 2, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 1, True, False, True, True, False, 0, 0, 0)],
+                        '1536' :[ h_traits('x', 'y', 'xs', 'ys', 1, 3, 4, 64, 8, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 3, 2, 128, 4, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 256, 2, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 6, 1, 256, 1, True, False, True, True, False, 0, 0, 0)],
+                        '2048' :[ h_traits('x', 'y', 'xs', 'ys', 1, 1, 1, 256, 8, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 2, 1, 256, 4, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 2, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 8, 1, 256, 1, True, False, True, True, False, 0, 0, 0)],
+                        '3072' :[ h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 128, 8, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 256, 4, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 6, 1, 256, 2, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 3, 1,1024, 1, True, False, True, True, False, 0, 0, 0)],
+                        '4096' :[ h_traits('x', 'y', 'xs', 'ys', 1, 2, 1, 256, 8, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 4, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 2, 1,1024, 2, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 4, 1,1024, 1, True, False, True, True, False, 0, 0, 0)],
+                        '6144' :[ h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 256, 8, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 3, 1, 512, 4, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 3, 1,1024, 2, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 6, 1,1024, 1, True, False, True, True, False, 0, 0, 0)],
+                        '8192' :[ h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 8, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 512, 4, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 4, 1,1024, 2, True, False, True, True, False, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 8, 1,1024, 1, True, False, True, True, False, 0, 0, 0)],
+                        'big'  :[ h_traits('x', 'y', 'xs', 'ys', 1, 2, 1, 256, 8, True, False, True, True, True, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 4, 1, 256, 4, True, False, True, True, True, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 2, 1,1024, 2, True, False, True, True, True, 0, 0, 0),
+                                  h_traits('x', 'y', 'xs', 'ys', 1, 4, 1,1024, 1, True, False, True, True, True, 0, 0, 0)]}
 
         total_blob = list()
         for hs_key in h_trait_dict:
             hs = h_trait_dict[hs_key]
             current_n = hs[0].F_Repeat_N * hs[0].F_ThreadPerBlock_N * hs[0].F_Vector_N
-            for dtype, scale_type, fused_add, fused_quant in itertools.product(dtype_list, scale_list, fused_add_list, fused_sweep_list):
+            for dtype, scale_type, xbias, fused_add, fused_quant in itertools.product(dtype_list, scale_list, xbias_list, fused_add_list, fused_sweep_list):
                 prec_i, prec_o = dtype.split(',')
                 scale_x, scale_y = scale_type.split(',')
                 if prec_o in dynamic_quant_out_dtype and fused_quant != 1:
@@ -563,6 +584,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
                 h_.F_YDataType = prec_o
                 h_.F_XScaleDataType = scale_y
                 h_.F_YScaleDataType = scale_x
+                h_.F_kXbias = xbias
                 h_.F_kFusedAdd = fused_add
                 h_.F_kFusedQuant = fused_quant
                 # disable welford update for 8bit and 16 bit smallN
@@ -579,7 +601,7 @@ float layernorm2d_fwd(layernorm2d_fwd_traits t,
                 current_hs.append(h_)  # + "\n"  #f.write(str(f.parent / GEN_DIR / (blobs.api_common_header_
             current_n_str = 'big' if hs_key == 'big' else current_n
-            total_blob.append(h_instance(dtype, current_n_str, fused_add, fused_quant, current_hs))
+            total_blob.append(h_instance(dtype, current_n_str, xbias, fused_add, fused_quant, current_hs))
 
         return total_blob
 
     def list_blobs(self, args) -> None:
diff --git a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
index b49c04619d..87774a1beb 100644
--- a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
+++ b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.cpp
@@ -41,6 +41,7 @@ auto create_args(int argc, char* argv[])
         .insert("prec_sy", "auto", "output quant scale type, set auto will use fp32. used when fquant=1 or 2")
+        .insert("xbias", "0", "add bias, 0:no add, 1:add bias before fadd")
         .insert("fadd", "0", "fused-add, 0:no fused add, 1:preadd+store, 2:preadd only")
         .insert("fquant", "0", "fused-quant, 0:no, 1:smooth-dynamic-quant, 2:dynamic-quant")
         .insert("warmup", "5", "cold iter")
@@ -93,6 +94,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     int do_validation = arg_parser.get_int("v");
     int warmup        = arg_parser.get_int("warmup");
     int repeat        = arg_parser.get_int("repeat");
+    int xbias         = arg_parser.get_int("xbias");
     int fused_add     = arg_parser.get_int("fadd");
     int fused_quant   = arg_parser.get_int("fquant");
     if(fused_quant == 1 && prec_o != "int8")
@@ -107,6 +109,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     using XDataType     = typename TypeConfig::XDataType;
     using YDataType     = typename TypeConfig::YDataType;
+    using XBiasDataType = typename TypeConfig::XBiasDataType;
     using GammaDataType = typename TypeConfig::GammaDataType;
     using BetaDataType  = typename TypeConfig::BetaDataType;
 
     using XResidualDataType = XDataType;
@@ -121,6 +124,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     // host verify
     ck_tile::HostTensor<XDataType> x_host({m, n}, {x_stride, 1});
+    ck_tile::HostTensor<XBiasDataType> x_bias_host({n});
     ck_tile::HostTensor<GammaDataType> gamma_host({n});
     ck_tile::HostTensor<BetaDataType> beta_host({n});
@@ -141,10 +145,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
     ck_tile::FillUniformDistribution<XDataType>{-.5f, .5f}(x_host);
     ck_tile::FillUniformDistribution<XResidualDataType>{-.5f, .5f}(x_residual_host);
     ck_tile::FillUniformDistribution{-1.f, 1.f}(x_scale_host);
+    ck_tile::FillUniformDistribution<XBiasDataType>{-.5f, .5f}(x_bias_host);
     ck_tile::FillUniformDistribution<GammaDataType>{-.5f, .5f}(gamma_host);
     ck_tile::FillUniformDistribution<BetaDataType>{-.5f, .5f}(beta_host);
 
     ck_tile::DeviceMem x_buf(x_host.get_element_space_size_in_bytes());
+    ck_tile::DeviceMem x_bias_buf(x_bias_host.get_element_space_size_in_bytes());
     ck_tile::DeviceMem gamma_buf(gamma_host.get_element_space_size_in_bytes());
     ck_tile::DeviceMem beta_buf(beta_host.get_element_space_size_in_bytes());
     ck_tile::DeviceMem y_buf(y_host_dev.get_element_space_size_in_bytes());
@@ -155,6 +161,7 @@ bool run(const ck_tile::ArgParser& arg_parser)
     ck_tile::DeviceMem y_residual_buf(y_residual_host.get_element_space_size_in_bytes());
 
     x_buf.ToDevice(x_host.data());
+    x_bias_buf.ToDevice(x_bias_host.data());
     gamma_buf.ToDevice(gamma_host.data());
     beta_buf.ToDevice(beta_host.data());
     x_residual_buf.ToDevice(x_residual_host.data());
@@ -179,11 +186,12 @@ bool run(const ck_tile::ArgParser& arg_parser)
               << ", yr_stride:" << yr_stride << std::flush;
 
     layernorm2d_fwd_traits traits{
-        prec_i, prec_o, prec_sx, prec_sy, SaveMeanVar, fused_add, fused_quant};
+        prec_i, prec_o, prec_sx, prec_sy, SaveMeanVar, xbias, fused_add, fused_quant};
 
     layernorm2d_fwd_args args{x_buf.GetDeviceBuffer(),
                               fused_add != 0 ? x_residual_buf.GetDeviceBuffer() : nullptr,
                               fused_quant == 1 ? x_scale_buf.GetDeviceBuffer() : nullptr,
+                              x_bias_buf.GetDeviceBuffer(),
                               gamma_buf.GetDeviceBuffer(),
                               beta_buf.GetDeviceBuffer(),
@@ -210,8 +218,9 @@ bool run(const ck_tile::ArgParser& arg_parser)
         return false;
     }
 
-    std::size_t num_byte = sizeof(XDataType) * m * n + sizeof(GammaDataType) * n +
-                           sizeof(BetaDataType) * n + sizeof(YDataType) * m * n;
+    std::size_t num_byte = sizeof(XDataType) * m * n + sizeof(XBiasDataType) * n +
+                           sizeof(GammaDataType) * n + sizeof(BetaDataType) * n +
+                           sizeof(YDataType) * m * n;
 
     float gb_per_sec = num_byte / 1.E6 / ave_time;
     std::cout << ", " << ave_time * 1.E3 << " us, " << gb_per_sec << " GB/s" << std::flush;
@@ -221,6 +230,22 @@ bool run(const ck_tile::ArgParser& arg_parser)
     if(do_validation)
     {
         // reference
+        if(xbias != 0)
+        {
+            // add bias before fadd
+            int M = x_host.mDesc.get_lengths()[0];
+            int N = x_host.mDesc.get_lengths()[1];
+            for(int idx_m = 0; idx_m < M; ++idx_m)
+            {
+                for(int idx_n = 0; idx_n < N; ++idx_n)
+                {
+                    x_host(idx_m, idx_n) = ck_tile::type_convert<XDataType>(
+                        ck_tile::type_convert<ComputeDataType>(x_host(idx_m, idx_n)) +
+                        ck_tile::type_convert<ComputeDataType>(x_bias_host(idx_n)));
+                }
+            }
+        }
+
         if(fused_add != 0)
         {
             // fused pre_add/pre_add_store
diff --git a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.hpp b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.hpp
index a0f2db0e8a..c126930694 100644
--- a/example/ck_tile/02_layernorm2d/layernorm2d_fwd.hpp
+++ b/example/ck_tile/02_layernorm2d/layernorm2d_fwd.hpp
@@ -16,6 +16,7 @@ struct LayerNormTypeConfig;
+    using XBiasDataType   = remove_cvref_t;
     using GammaDataType   = remove_cvref_t;
     using BetaDataType    = remove_cvref_t;
     using ComputeDataType = remove_cvref_t;
@@ -67,6 +69,7 @@ struct Layernorm2dFwd
     static constexpr bool kPadM    = false; // always no need to pad along M
     static constexpr bool kPadN    = Problem::Traits::kPadN;
     static constexpr bool kTwoPass = Problem::Traits::kTwoPass;
+    static constexpr auto kXbias      = Problem::Traits::kXbias;
     static constexpr auto kFusedAdd   = Problem::Traits::kFusedAdd;
     static constexpr auto kFusedQuant = Problem::Traits::kFusedQuant;
@@ -82,6 +85,7 @@ struct Layernorm2dFwd
         const void* p_x;          // [m ,n], input, fp16/bf16
         const void* p_x_residual; // [m ,n], shortcut input, prec same as input, nullptr if not used
         const void* p_x_scale;    // [1 ,n], smooth scale input, fp32, nullptr if not used
+        const void* p_x_bias;     // [1, n], bias, prec same as input
         const void* p_gamma;      // [1, n], gamma, prec same as input
         const void* p_beta;       // [1, n], beta, prec same as input
@@ -108,6 +112,7 @@ struct Layernorm2dFwd
         return Kargs{hargs.p_x,
                      hargs.p_x_residual,
                      hargs.p_x_scale,
+                     hargs.p_x_bias,
                      hargs.p_gamma,
                      hargs.p_beta,
                      hargs.p_y,
@@ -152,6 +157,7 @@ struct Layernorm2dFwd
         using S_ = typename Problem::BlockShape;
         auto surfix = [&] () {
             std::string n;
+            if (kXbias != Layernorm2dXBiasEnum::NO_BIAS) n += _SS_("_") + Layernorm2dXBiasEnumName<kXbias>::name;
             if (kFusedAdd != Layernorm2dFusedAddEnum::NO_ADD) n += _SS_("_") + Layernorm2dFusedAddEnumName<kFusedAdd>::name;
             if (kFusedQuant != Layernorm2dFusedQuantEnum::NO_SWEEP) n += _SS_("_") + Layernorm2dFusedQuantEnumName<kFusedQuant>::name;
             if (kPadN) n += "_pn";
@@ -228,6 +234,27 @@ struct Layernorm2dFwd
         }
     }();
 
+        const auto x_bias_window = [&]() {
+            if constexpr(kXbias == Layernorm2dXBiasEnum::ADD_BIAS)
+            {
+                const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
+                    static_cast<const XBiasDataType*>(kargs.p_x_bias),
+                    make_tuple(kargs.n),
+                    make_tuple(1),
+                    number{},
+                    number<1>{});
+
+                const auto tmp2_ =
+                    pad_tensor_view(tmp_, make_tuple(number{}), sequence{});
+
+                return make_tile_window(tmp2_, make_tuple(number{}), {0});
+            }
+            else
+            {
+                return make_null_tile_window(make_tuple(number{}));
+            }
+        }();
+
         const auto gamma_window = [&]() {
             const auto tmp_ = make_naive_tensor_view<address_space_enum::global>(
                 static_cast<const GammaDataType*>(kargs.p_gamma),
@@ -371,6 +398,7 @@ struct Layernorm2dFwd
         Pipeline{}(x_window,
                    x_residual_window,
+                   x_bias_window,
                    gamma_window,
                    beta_window,
                    y_window,
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
index a30a9256ab..4967c05096 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_one_pass.hpp
@@ -18,6 +18,7 @@ struct Layernorm2dFwdPipelineOnePass
     using Policy = ck_tile::remove_cvref_t<Policy_>;
 
     using XDataType       = ck_tile::remove_cvref_t<typename Problem::XDataType>;
+    using XBiasDataType   = ck_tile::remove_cvref_t<typename Problem::XBiasDataType>;
     using GammaDataType   = ck_tile::remove_cvref_t<typename Problem::GammaDataType>;
     using BetaDataType    = ck_tile::remove_cvref_t<typename Problem::BetaDataType>;
     using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
@@ -38,6 +39,7 @@ struct Layernorm2dFwdPipelineOnePass
     static constexpr bool kPadN     = Problem::Traits::kPadN;
     static constexpr bool kFastFDiv = Problem::Traits::kFastFDiv;
     static constexpr bool kWelford  = Problem::Traits::kWelford;
+    static constexpr auto kXbias      = Problem::Traits::kXbias;
     static constexpr auto kFusedAdd   = Problem::Traits::kFusedAdd;
     static constexpr auto kFusedQuant = Problem::Traits::kFusedQuant;
@@ -55,6 +57,7 @@ struct Layernorm2dFwdPipelineOnePass
     template
     CK_TILE_DEVICE auto operator()(const XWindow& x_window_,
                                    const XResidualWindow& x_residual_window_,
+                                   const XBiasWindow& x_bias_window_,
                                    const GammaWindow& gamma_window_,
                                    const BetaWindow& beta_window_,
                                    YWindow& y_window_,
@@ -81,6 +85,8 @@ struct Layernorm2dFwdPipelineOnePass
     {
         const auto x_window = make_tile_window(x_window_, Policy::template MakeXBlockTileDistribution<Problem>());
+        const auto x_bias_window = make_tile_window(
+            x_bias_window_, Policy::template MakeGammaBetaBlockTileDistribution<Problem>());
         const auto gamma_window = make_tile_window(
             gamma_window_, Policy::template MakeGammaBetaBlockTileDistribution<Problem>());
         const auto beta_window = make_tile_window(
@@ -90,8 +96,9 @@ struct Layernorm2dFwdPipelineOnePass
         auto y_residual_window = make_tile_window(
             y_residual_window_, Policy::template MakeXBlockTileDistribution<Problem>());
 
-        auto x      = load_tile(x_window);
-        auto x_resi = load_tile(x_residual_window);
+        auto x            = load_tile(x_window);
+        auto x_resi       = load_tile(x_residual_window);
+        const auto x_bias = load_tile(x_bias_window);
 
         int cur_count = 0;
         int max_count =
@@ -112,6 +119,15 @@ struct Layernorm2dFwdPipelineOnePass
 
         auto acc = cast_tile<ComputeDataType>(x);
 
+        if constexpr(kXbias == Layernorm2dXBiasEnum::ADD_BIAS)
+        {
+            sweep_tile(x, [&](auto idx) {
+                // compute x = bias + x
+                constexpr auto j_idx = make_tuple(idx[number<1>{}]);
+                acc(idx) = type_convert<ComputeDataType>(x_bias[j_idx]) + acc(idx);
+            });
+        }
+
         if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE ||
                      kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD)
         {
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp
index 17ff80f471..eb1d8c91a4 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_problem.hpp
@@ -8,6 +8,7 @@ namespace ck_tile {
 template
     using XDataType       = remove_cvref_t<XDataType_>;
+    using XBiasDataType   = remove_cvref_t<XBiasDataType_>;
     using GammaDataType   = remove_cvref_t<GammaDataType_>;
     using BetaDataType    = remove_cvref_t<BetaDataType_>;
     using ComputeDataType = remove_cvref_t<ComputeDataType_>;
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
index 4a37be8776..10556d36dd 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_pipeline_two_pass.hpp
@@ -17,6 +17,7 @@ struct Layernorm2dFwdPipelineTwoPass
     using Policy = ck_tile::remove_cvref_t<Policy_>;
 
     using XDataType       = ck_tile::remove_cvref_t<typename Problem::XDataType>;
+    using XBiasDataType   = ck_tile::remove_cvref_t<typename Problem::XBiasDataType>;
     using GammaDataType   = ck_tile::remove_cvref_t<typename Problem::GammaDataType>;
     using BetaDataType    = ck_tile::remove_cvref_t<typename Problem::BetaDataType>;
     using ComputeDataType = ck_tile::remove_cvref_t<typename Problem::ComputeDataType>;
@@ -37,6 +38,7 @@ struct Layernorm2dFwdPipelineTwoPass
     static constexpr bool kPadN     = Problem::Traits::kPadN;
     static constexpr bool kFastFDiv = Problem::Traits::kFastFDiv;
     static constexpr bool kWelford  = Problem::Traits::kWelford;
+    static constexpr auto kXbias      = Problem::Traits::kXbias;
     static constexpr auto kFusedAdd   = Problem::Traits::kFusedAdd;
     static constexpr auto kFusedQuant = Problem::Traits::kFusedQuant;
@@ -54,6 +56,7 @@ struct Layernorm2dFwdPipelineTwoPass
     template
     CK_TILE_DEVICE auto operator()(const XWindow& x_window_,
                                    const XResidualWindow& x_residual_window_,
+                                   const XBiasWindow& x_bias_window_,
                                    const GammaWindow& gamma_window_,
                                    const BetaWindow& beta_window_,
                                    YWindow& y_window,
@@ -81,6 +85,2 @@ struct Layernorm2dFwdPipelineTwoPass
         static_assert(kWelford == true, "2 pass only supports welford merge");
 
         auto x_window = make_tile_window(x_window_, Policy::template MakeXBlockTileDistribution<Problem>());
+        auto x_bias_window = make_tile_window(
+            x_bias_window_, Policy::template MakeGammaBetaBlockTileDistribution<Problem>());
         auto gamma_window = make_tile_window(
             gamma_window_, Policy::template MakeGammaBetaBlockTileDistribution<Problem>());
         auto beta_window = make_tile_window(
@@ -115,13 +121,24 @@ struct Layernorm2dFwdPipelineTwoPass
         for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
         {
-            auto x      = load_tile(x_window);
-            auto x_resi = load_tile(x_residual_window);
+            auto x            = load_tile(x_window);
+            auto x_resi       = load_tile(x_residual_window);
+            const auto x_bias = load_tile(x_bias_window);
             move_tile_window(x_window, {0, Block_N});
             move_tile_window(x_residual_window, {0, Block_N});
+            move_tile_window(x_bias_window, {Block_N});
 
             auto acc = cast_tile<ComputeDataType>(x);
 
+            if constexpr(kXbias == Layernorm2dXBiasEnum::ADD_BIAS)
+            {
+                sweep_tile(x, [&](auto idx) {
+                    // compute x = bias + x
+                    constexpr auto j_idx = make_tuple(idx[number<1>{}]);
+                    acc(idx) = type_convert<ComputeDataType>(x_bias[j_idx]) + acc(idx);
+                });
+            }
+
             if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE ||
                          kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD)
             {
@@ -167,6 +184,7 @@ struct Layernorm2dFwdPipelineTwoPass
 
         move_tile_window(x_window, {0, -Block_N});
         move_tile_window(x_residual_window, {0, -Block_N});
+        move_tile_window(x_bias_window, {-Block_N});
         move_tile_window(gamma_window, {stride_to_right_most_window});
         move_tile_window(beta_window, {stride_to_right_most_window});
         move_tile_window(y_window, {0, stride_to_right_most_window});
@@ -174,9 +192,19 @@ struct Layernorm2dFwdPipelineTwoPass
         // layernorm computation
         for(int iN = __builtin_amdgcn_readfirstlane(0); iN < num_n_tile_iteration; ++iN)
         {
-            auto x      = load_tile(x_window);
-            auto x_resi = load_tile(x_residual_window);
-            auto acc    = cast_tile<ComputeDataType>(x);
+            auto x            = load_tile(x_window);
+            auto x_resi       = load_tile(x_residual_window);
+            const auto x_bias = load_tile(x_bias_window);
+            auto acc          = cast_tile<ComputeDataType>(x);
+
+            if constexpr(kXbias == Layernorm2dXBiasEnum::ADD_BIAS)
+            {
+                sweep_tile(x, [&](auto idx) {
+                    // compute x = bias + x
+                    constexpr auto j_idx = make_tuple(idx[number<1>{}]);
+                    acc(idx) = type_convert<ComputeDataType>(x_bias[j_idx]) + acc(idx);
+                });
+            }
 
             if constexpr(kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD_STORE ||
                          kFusedAdd == Layernorm2dFusedAddEnum::PRE_ADD)
@@ -209,6 +237,7 @@ struct Layernorm2dFwdPipelineTwoPass
 
             move_tile_window(x_window, {0, -Block_N});
             move_tile_window(x_residual_window, {0, -Block_N});
+            move_tile_window(x_bias_window, {-Block_N});
             move_tile_window(gamma_window, {-Block_N});
             move_tile_window(beta_window, {-Block_N});
             move_tile_window(y_window, {0, -Block_N});
diff --git a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp
index 045bd24e49..189ca9fe80 100644
--- a/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp
+++ b/include/ck_tile/ops/layernorm2d/pipeline/layernorm2d_fwd_traits.hpp
@@ -7,6 +7,19 @@
 namespace ck_tile {
 
+enum class Layernorm2dXBiasEnum
+{
+    NO_BIAS = 0,
+    // add bias before fused add
+    ADD_BIAS = 1,
+};
+
+// clang-format off
+template<Layernorm2dXBiasEnum> struct Layernorm2dXBiasEnumName;
+template<> struct Layernorm2dXBiasEnumName<Layernorm2dXBiasEnum::NO_BIAS> { static constexpr const char * name = "no"; };
+template<> struct Layernorm2dXBiasEnumName<Layernorm2dXBiasEnum::ADD_BIAS> { static constexpr const char * name = "xbias"; };
+// clang-format on
+
 enum class Layernorm2dFusedAddEnum
 {
     NO_ADD = 0,
@@ -42,6 +55,7 @@ template struct Layernorm2dFwdTraits
@@ -51,6 +65,7 @@ struct Layernorm2dFwdTraits
     static constexpr bool kFastFDiv = kFastFDiv_;
     static constexpr bool kWelford  = kWelford_;
    static constexpr bool kTwoPass  = kTwoPass_;
+    static constexpr Layernorm2dXBiasEnum kXbias           = kXbias_;
     static constexpr Layernorm2dFusedAddEnum kFusedAdd     = kFusedAdd_;
     static constexpr Layernorm2dFusedQuantEnum kFusedQuant = kFusedQuant_;
 };
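
Reviewer note: the operator order this patch implements is layernorm(x + x_bias + x_residual), i.e. the new xbias add happens before the existing fused residual add (fadd), and, per the commit message, either step can now be enabled independently of the other. Below is a minimal host-side sketch of those semantics in plain fp32 C++. It mirrors the validation loop added to layernorm2d_fwd.cpp above, but it is not ck_tile code; the function name, signature, and epsilon default are illustrative only, and the real kernel round-trips each add through ComputeDataType instead of staying in fp32.

#include <cmath>
#include <vector>

// Illustrative reference for the fused layernorm2d forward path (fp32 only).
void layernorm2d_fwd_ref(std::vector<float>& x,            // [m, n], updated in place like x_host above
                         const std::vector<float>& x_bias, // [n], used when xbias == 1
                         const std::vector<float>& x_resi, // [m, n], used when fadd != 0
                         const std::vector<float>& gamma,  // [n]
                         const std::vector<float>& beta,   // [n]
                         std::vector<float>& y,            // [m, n]
                         int m, int n, int xbias, int fused_add, float epsilon = 1e-5f)
{
    for(int i = 0; i < m; ++i)
    {
        // step 1: optional bias add, applied *before* the fused residual add
        if(xbias != 0)
            for(int j = 0; j < n; ++j)
                x[i * n + j] += x_bias[j];

        // step 2: optional fused residual add (fadd=1 additionally stores this
        // pre-norm sum back out; fadd=2 only consumes it)
        if(fused_add != 0)
            for(int j = 0; j < n; ++j)
                x[i * n + j] += x_resi[i * n + j];

        // step 3: plain row-wise layernorm of the pre-processed row
        float mean = 0.f;
        for(int j = 0; j < n; ++j)
            mean += x[i * n + j];
        mean /= n;

        float var = 0.f;
        for(int j = 0; j < n; ++j)
        {
            float d = x[i * n + j] - mean;
            var += d * d;
        }
        var /= n;

        float inv_std = 1.f / std::sqrt(var + epsilon);
        for(int j = 0; j < n; ++j)
            y[i * n + j] = gamma[j] * (x[i * n + j] - mean) * inv_std + beta[j];
    }
}

With the argparse change above, the same combination is exercised on device by passing -xbias=1 (optionally together with -fadd=1 or -fadd=2) to the example, and generate.py emits one kernel instance per (xbias, fadd, fquant) combination it enumerates.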