From a05c4ff96c936bb9b490c9e73ac405f7443e4b0c Mon Sep 17 00:00:00 2001 From: rocking Date: Tue, 10 Dec 2024 11:36:18 +0800 Subject: [PATCH] [CK TILE] Use config name instead of data type in FmhaFwdTypeConfig (#1731) * Add data type config, Prepare to add mix precision in the future * Fix compile error [ROCm/composable_kernel commit: 94ae7113bd05e3c39364193dba1b391a4c54a2f4] --- .../ck_tile/01_fmha/codegen/cpp_symbol_map.py | 15 ++- .../ck_tile/01_fmha/codegen/ops/fmha_bwd.py | 14 +-- .../ck_tile/01_fmha/codegen/ops/fmha_fwd.py | 11 ++- .../01_fmha/codegen/ops/fmha_fwd_appendkv.py | 9 +- .../01_fmha/codegen/ops/fmha_fwd_splitkv.py | 27 ++--- example/ck_tile/01_fmha/fmha_bwd.cpp | 14 +-- example/ck_tile/01_fmha/fmha_bwd.hpp | 12 ++- example/ck_tile/01_fmha/fmha_fwd.cpp | 99 ++++++++++--------- example/ck_tile/01_fmha/fmha_fwd.hpp | 32 +++++- 9 files changed, 142 insertions(+), 91 deletions(-) diff --git a/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py b/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py index 66691356ab..f6df44a318 100644 --- a/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py +++ b/example/ck_tile/01_fmha/codegen/cpp_symbol_map.py @@ -2,10 +2,17 @@ # Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. # generate kernel instances to speed up compilation -DTYPE_MAP = { - "fp16": "ck_tile::fp16_t", - "bf16": "ck_tile::bf16_t", - "fp8" : "ck_tile::fp8_t" +FWD_DTYPE_MAP = { + "fp16" : "FmhaFwdFp16", + "bf16" : "FmhaFwdBf16", + "fp8" : "FmhaFwdFp8", + "fp8fp16": "FmhaFwdFp8Fp16", + "fp8bf16": "FmhaFwdFp8Bf16" +} + +BWD_DTYPE_MAP = { + "fp16": "FmhaBwdFp16", + "bf16": "FmhaBwdBf16" } MASK_IMPL = { diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py index 096394c0c9..83a1e82d6d 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_bwd.py @@ -283,7 +283,7 @@ class FmhaBwdApiPool: inners = inners + FMHA_BWD_API_INNER_DISPATCH.format(F_if=if_k, F_mode=MODE_MAP[trait.mode], F_pipeline_enum=BWD_DQDKDV_PIPELINE_ENUM_MAP[trait.pipeline], F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_mask=get_mask_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias], F_bias=BIAS_MAP[trait.bias], F_dbias=BOOL_MAP[trait.dbias], F_dropout_check=DROPOUT_CHECK_MAP[trait.dropout], F_dropout=DROPOUT_MAP[trait.dropout], - F_scheck=trait.scheck(spad1=spad1), F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_hdim=hdim, F_dtype=DTYPE_MAP[dtype], + F_scheck=trait.scheck(spad1=spad1), F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_hdim=hdim, F_dtype=BWD_DTYPE_MAP[dtype], F_spad0=BOOL_MAP[trait.spad], F_spad1=BOOL_MAP[spad1], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad], F_deterministic=BOOL_MAP[trait.deterministic]) @@ -360,7 +360,7 @@ class FmhaBwdDQDKDVKernel: FMHA_BWD_DQ_DK_DV_KERNEL_BODY.format( F_idx = self.F_idx, F_hdim = self.F_hdim, - F_dtype = DTYPE_MAP[self.F_dtype], + F_dtype = BWD_DTYPE_MAP[self.F_dtype], F_bm0 = self.F_tile.F_bm0, F_bn0 = self.F_tile.F_bn0, F_bk0 = self.F_tile.F_bk0, @@ -469,7 +469,7 @@ def get_bwd_dq_dk_dv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> gen = list() api_pool = FmhaBwdApiPool(mask_impl) - for dtype in DTYPE_MAP.keys(): + for dtype in BWD_DTYPE_MAP.keys(): d = get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype) if d == None: continue @@ -585,7 +585,7 @@ class FmhaBwdOGradDotOKernel: FMHA_BWD_DOT_DO_O_KERNEL_BODY.format( F_idx = self.F_idx, F_hdim = self.F_hdim, - F_dtype = DTYPE_MAP[self.F_dtype], + F_dtype = BWD_DTYPE_MAP[self.F_dtype], F_spad = BOOL_MAP[self.F_spad], F_dvpad = BOOL_MAP[self.F_dvpad], F_mode = MODE_MAP[self.F_mode], @@ -616,7 +616,7 @@ def get_bwd_dot_do_o_blobs() -> List[FmhaBwdOGradDotOKernel]: gen = list() - for dtype in DTYPE_MAP.keys(): + for dtype in BWD_DTYPE_MAP.keys(): d = get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype) if d == None: continue @@ -716,7 +716,7 @@ class FmhaBwdConvertQGradKernel: FMHA_BWD_CONVERT_DQ_KERNEL_BODY.format( F_idx = self.F_idx, F_hdim = self.F_hdim, - F_dtype = DTYPE_MAP[self.F_dtype], + F_dtype = BWD_DTYPE_MAP[self.F_dtype], F_bm0 = self.F_bm0, F_bn0 = self.F_bn0, F_spad = BOOL_MAP[self.F_spad], @@ -751,7 +751,7 @@ def get_bwd_convert_dq_blobs() -> List[FmhaBwdConvertQGradKernel]: gen = list() - for dtype in DTYPE_MAP.keys(): + for dtype in BWD_DTYPE_MAP.keys(): d = get_fmha_bwd_dq_dk_dv_tile_ppl_dict_from_dtype(dtype) if d == None: continue diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py index e5ee1d22e7..eca638784d 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd.py @@ -282,7 +282,7 @@ class FmhaFwdApiPool: F_squant=BOOL_MAP[trait.squant], F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad], F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0max=trait.bk0max, - F_hdim=hdim, F_dtype=DTYPE_MAP[dtype]) + F_hdim=hdim, F_dtype=FWD_DTYPE_MAP[dtype]) if_j = 'if' if j == 0 else 'else if' per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners) if_i = 'if' if i == 0 else 'else if' @@ -301,7 +301,7 @@ class FmhaFwdTileSize: F_bk1 : int # tile size along kv gemm unroll F_bk0max : int # total length of K0, used for pipeline that need load Q at once (or repeately load Q as a whole tile) F_rm0 : int # number of warps for gemm0 along q seqlen - F_rn0 : int # number of warps for gemm0 along k seqlen + F_rn0 : int # number of warps for gemm0 along k seqlen F_rk0 : int # number of warps for gemm0 along head dim q (not used) F_rm1 : int # number of warps for gemm1 along q seqlen F_rn1 : int # number of warps for gemm1 along head dim v @@ -339,7 +339,7 @@ class FmhaFwdKernel: FMHA_FWD_KERNEL_BODY.format( F_idx = self.F_idx, F_hdim = self.F_hdim, - F_dtype = DTYPE_MAP[self.F_dtype], + F_dtype = FWD_DTYPE_MAP[self.F_dtype], F_bm0 = self.F_tile.F_bm0, F_bn0 = self.F_tile.F_bn0, F_bk0 = self.F_tile.F_bk0, @@ -462,6 +462,9 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[Fm # no need lse/dropout kernels for mask, bias in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys()): pipelines.append(FmhaFwdPipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, 'f', 'f', squant, mask)) + elif dtype in ['fp8fp16', 'fp8bf16']: + # TODO + None else: assert False return pipelines @@ -469,7 +472,7 @@ def get_fwd_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> Tuple[Fm gen = list() api_pool = FmhaFwdApiPool(mask_impl) - for dtype in DTYPE_MAP.keys(): + for dtype in FWD_DTYPE_MAP.keys(): d = get_fmha_fwd_tile_dict_from_dtype(dtype) if d == None: continue diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py index cfd1d01c91..fb998a33d7 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_appendkv.py @@ -181,7 +181,7 @@ class FmhaFwdAppendKVApiPool: inners = inners + FMHA_FWD_APPENDKV_API_INNER_DISPATCH.format(F_if=if_k, F_vlayout=LAYOUT_MAP[trait.vlayout], F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_rope_check=ROPE_CHECK_MAP[trait.rope], F_pagedkv=BOOL_MAP[trait.pagedkv], F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad], - F_rope=ROPE_MAP[trait.rope], F_bs=trait.bs, F_bsk=trait.bsk, F_bd=trait.bd, F_bdv=trait.bdv, F_hdim=hdim, F_dtype=DTYPE_MAP[dtype]) + F_rope=ROPE_MAP[trait.rope], F_bs=trait.bs, F_bsk=trait.bsk, F_bd=trait.bd, F_bdv=trait.bdv, F_hdim=hdim, F_dtype=FWD_DTYPE_MAP[dtype]) if_j = 'if' if j == 0 else 'else if' per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners) if_i = 'if' if i == 0 else 'else if' @@ -216,7 +216,7 @@ class FmhaFwdAppendKVKernel: FMHA_FWD_APPENDKV_KERNEL_BODY.format( F_idx = self.F_idx, F_hdim = self.F_hdim, - F_dtype = DTYPE_MAP[self.F_dtype], + F_dtype = FWD_DTYPE_MAP[self.F_dtype], F_bs = self.F_tile.F_bs, F_bsk = self.F_tile.F_bsk, F_bd = self.F_tile.F_bd, @@ -301,6 +301,9 @@ def get_fwd_appendkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> elif dtype in ['fp8', 'bf8']: # rope/paged-kv is not supported pipelines.append(FmhaFwdAppendKVPipeline('col', 't', 't', 't', 't', 'no', 'f')) + elif dtype in ['fp8fp16', 'fp8bf16']: + # TODO + None else: assert False return pipelines @@ -308,7 +311,7 @@ def get_fwd_appendkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> gen = list() api_pool = FmhaFwdAppendKVApiPool(mask_impl) - for dtype in DTYPE_MAP.keys(): + for dtype in FWD_DTYPE_MAP.keys(): d = get_fmha_fwd_appendkv_tile_dict_from_dtype(dtype) if d == None: continue diff --git a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py index 1c40cf6f31..e448902cf8 100644 --- a/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py +++ b/example/ck_tile/01_fmha/codegen/ops/fmha_fwd_splitkv.py @@ -112,7 +112,7 @@ static void run(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a) }} using trait_{F_idx} = fmha_fwd_splitkv_traits_<{F_hdim}, {F_dtype}, {F_mode}, {F_bm0}, {F_bn0}, {F_bk0}, {F_bn1}, {F_bk1}, {F_bk0max}, {F_vlayout}, - {F_pipeline_enum}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, + {F_pipeline_enum}, fmha_mask_{F_idx}, {F_bias}, {F_lse}, {F_squant}, {F_pagedkv}, {F_spad}, {F_skpad}, {F_dpad}, {F_dvpad}>; #include @@ -161,7 +161,7 @@ using fmha_pipeline_problem = ck_tile::BlockFmhaSplitKVCombinePipelineProblem< typename FmhaFwdTypeConfig::OaccDataType, typename FmhaFwdTypeConfig::ODataType, {F_hdim}, - {F_bm0}, + {F_bm0}, {F_bn1}, {F_mode}, fmha_trait>; @@ -231,11 +231,11 @@ float fmha_fwd_splitkv_(const ck_tile::stream_config& s, fmha_fwd_splitkv_args a if(s.log_level_ > 0) std::cout << ", " << fmha_fwd_splitkv_get_name_() - << ", " << fmha_fwd_splitkv_combine_get_name_() + << ", " << fmha_fwd_splitkv_combine_get_name_() << std::flush; return ck_tile::launch_kernel(s, - [=](const ck_tile::stream_config& s_){{ fmha_fwd_splitkv_oneshot_(s_, a); }}, + [=](const ck_tile::stream_config& s_){{ fmha_fwd_splitkv_oneshot_(s_, a); }}, [=](const ck_tile::stream_config& s_){{ fmha_fwd_splitkv_combine_oneshot_(s_, a); }} ); }} @@ -431,11 +431,11 @@ class FmhaFwdSplitKVApiPool: inners = inners + FMHA_FWD_SPLITKV_API_INNER_DISPATCH.format(F_if=if_k, F_mode=MODE_MAP[trait.mode], F_vlayout=LAYOUT_MAP[trait.vlayout], F_pipeline_enum=PIPELINE_ENUM_MAP[trait.pipeline_tag], F_mask=get_mask_map(self.mask_impl)[trait.mask], F_mask_check=get_mask_check_map(self.mask_impl)[trait.mask], F_bias_check=BIAS_CHECK_MAP[trait.bias], F_bias=BIAS_MAP[trait.bias], - F_lse=BOOL_MAP[trait.lse], F_squant=BOOL_MAP[trait.squant], F_pagedkv=BOOL_MAP[trait.pagedkv], + F_lse=BOOL_MAP[trait.lse], F_squant=BOOL_MAP[trait.squant], F_pagedkv=BOOL_MAP[trait.pagedkv], F_scheck=trait.scheck, F_skcheck=trait.skcheck, F_dcheck=trait.dcheck, F_dvcheck=trait.dvcheck, F_spad=BOOL_MAP[trait.spad], F_skpad=BOOL_MAP[trait.skpad], F_dpad=BOOL_MAP[trait.dpad], F_dvpad=BOOL_MAP[trait.dvpad], F_bm0=trait.bm0, F_bn0=trait.bn0, F_bk0=trait.bk0, F_bn1=trait.bn1, F_bk1=trait.bk1, F_bk0max=trait.bk0max, - F_hdim=hdim, F_dtype=DTYPE_MAP[dtype]) + F_hdim=hdim, F_dtype=FWD_DTYPE_MAP[dtype]) if_j = 'if' if j == 0 else 'else if' per_hdim_case = per_hdim_case + FMHA_FWD_API_PER_HDIM_CASE.format(F_if=if_j, F_hdim=hdim, F_inner_dispatch=inners) if_i = 'if' if i == 0 else 'else if' @@ -472,7 +472,7 @@ class FmhaFwdSplitKVKernel: FMHA_FWD_SPLITKV_KERNEL_BODY.format( F_idx = self.F_idx, F_hdim = self.F_hdim, - F_dtype = DTYPE_MAP[self.F_dtype], + F_dtype = FWD_DTYPE_MAP[self.F_dtype], F_bm0 = self.F_tile.F_bm0, F_bn0 = self.F_tile.F_bn0, F_bk0 = self.F_tile.F_bk0, @@ -492,7 +492,7 @@ class FmhaFwdSplitKVKernel: F_spad = BOOL_MAP[self.F_pipeline.F_spad], F_skpad = BOOL_MAP[self.F_pipeline.F_skpad], F_dpad = BOOL_MAP[self.F_pipeline.F_dpad], - F_dvpad = BOOL_MAP[self.F_pipeline.F_dvpad], + F_dvpad = BOOL_MAP[self.F_pipeline.F_dvpad], F_bias = BIAS_MAP[self.F_pipeline.F_bias], F_lse = BOOL_MAP[self.F_pipeline.F_lse], F_squant = BOOL_MAP[self.F_pipeline.F_squant], @@ -552,7 +552,7 @@ class FmhaFwdSplitKVCombineKernel: FMHA_FWD_SPLITKV_COMBINE_KERNEL_BODY.format( F_idx = self.F_idx, F_hdim = self.F_hdim, - F_dtype = DTYPE_MAP[self.F_dtype], + F_dtype = FWD_DTYPE_MAP[self.F_dtype], F_bm0 = self.F_tile.F_bm0, F_bn1 = self.F_tile.F_bn1, F_spad = BOOL_MAP[self.F_pipeline.F_spad], @@ -625,7 +625,7 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> pipelines = [] if dtype in ['fp16', 'bf16']: for mask, bias, pagedkv in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys(), ["t", "f"]): - # TODO: use async pipeline when compiler is more stable + # TODO: use async pipeline when compiler is more stable if hdim == 256 or hdim in [32, 64, 128]: ### [32, 64, 96, 128]: # if True: pipelines.append(Pipeline('qr', 'row', 'f', 't', 'f', 'f', bias, 't', squant, pagedkv, mask)) @@ -644,6 +644,9 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> elif dtype in ['fp8', 'bf8']: for mask, bias in itertools.product(get_mask_map(mask_impl).keys(), BIAS_MAP.keys()): pipelines.append(Pipeline('qr', 'col', 'f', 'f', 'f', 'f', bias, 't', squant, 'f', mask)) + elif dtype in ['fp8fp16', 'fp8bf16']: + # TODO + None else: assert False return pipelines @@ -651,7 +654,7 @@ def get_fwd_splitkv_blobs(kernel_filter : Optional[str], receipt, mask_impl) -> gen = list() api_pool = FmhaFwdSplitKVApiPool(mask_impl) - for dtype in DTYPE_MAP.keys(): + for dtype in FWD_DTYPE_MAP.keys(): d = get_fmha_fwd_tile_dict_from_dtype(dtype) if d == None: continue @@ -711,7 +714,7 @@ def get_fwd_splitkv_combine_blobs(kernel_filter : Optional[str], receipt) -> Lis gen = list() - for dtype in DTYPE_MAP.keys(): + for dtype in FWD_DTYPE_MAP.keys(): d = get_fmha_fwd_splitkv_combine_tile_dict_from_dtype(dtype) if d == None: continue diff --git a/example/ck_tile/01_fmha/fmha_bwd.cpp b/example/ck_tile/01_fmha/fmha_bwd.cpp index 2d76627a72..eaf99529f3 100644 --- a/example/ck_tile/01_fmha/fmha_bwd.cpp +++ b/example/ck_tile/01_fmha/fmha_bwd.cpp @@ -101,7 +101,7 @@ auto create_args(int argc, char* argv[]) } // different threshold for different dtype -template +template auto get_elimit(ck_tile::index_t /*hdim_q*/, ck_tile::index_t /*hdim_v*/) { double rtol = 1e-2; @@ -110,7 +110,7 @@ auto get_elimit(ck_tile::index_t /*hdim_q*/, ck_tile::index_t /*hdim_v*/) } template <> -auto get_elimit(ck_tile::index_t hdim_q, ck_tile::index_t hdim_v) +auto get_elimit(ck_tile::index_t hdim_q, ck_tile::index_t hdim_v) { double rtol = 1e-2; double atol = 1e-2; @@ -122,7 +122,7 @@ auto get_elimit(ck_tile::index_t hdim_q, ck_tile::index_t hdim_ return ck_tile::make_tuple(rtol, atol); } -template +template bool run(const ck_tile::ArgParser& arg_parser) { std::string data_type = arg_parser.get_str("prec"); @@ -209,7 +209,7 @@ bool run(const ck_tile::ArgParser& arg_parser) const auto seqstart_q_host = generate_seqstarts(mode, batch, seqlen_q); const auto seqstart_k_host = generate_seqstarts(mode, batch, seqlen_k); - using TypeConfig = FmhaBwdTypeConfig; + using TypeConfig = FmhaBwdTypeConfig; using QDataType = typename TypeConfig::QDataType; using KDataType = typename TypeConfig::KDataType; @@ -933,7 +933,7 @@ bool run(const ck_tile::ArgParser& arg_parser) } // clang-format on - auto [rtol, atol] = get_elimit(hdim_q, hdim_v); + auto [rtol, atol] = get_elimit(hdim_q, hdim_v); bool dq_cur_pass = ck_tile::check_err(dq_host_result, dq_host_ref, std::string("Error: QGrad Incorrect results!"), @@ -986,11 +986,11 @@ int main(int argc, char* argv[]) const std::string data_type = arg_parser.get_str("prec"); if(data_type == "fp16") { - return run(arg_parser) ? 0 : -2; + return run(arg_parser) ? 0 : -2; } else if(data_type == "bf16") { - return run(arg_parser) ? 0 : -2; + return run(arg_parser) ? 0 : -2; } return -3; diff --git a/example/ck_tile/01_fmha/fmha_bwd.hpp b/example/ck_tile/01_fmha/fmha_bwd.hpp index 722ef15a2f..6204cbcfa8 100644 --- a/example/ck_tile/01_fmha/fmha_bwd.hpp +++ b/example/ck_tile/01_fmha/fmha_bwd.hpp @@ -14,11 +14,19 @@ #include #include +struct FmhaBwdFp16 +{ +}; + +struct FmhaBwdBf16 +{ +}; + template struct FmhaBwdTypeConfig; template <> -struct FmhaBwdTypeConfig +struct FmhaBwdTypeConfig { using QDataType = ck_tile::half_t; using KDataType = ck_tile::half_t; @@ -38,7 +46,7 @@ struct FmhaBwdTypeConfig }; template <> -struct FmhaBwdTypeConfig +struct FmhaBwdTypeConfig { using QDataType = ck_tile::bf16_t; using KDataType = ck_tile::bf16_t; diff --git a/example/ck_tile/01_fmha/fmha_fwd.cpp b/example/ck_tile/01_fmha/fmha_fwd.cpp index 1f0d73d950..ebf2c93a33 100644 --- a/example/ck_tile/01_fmha/fmha_fwd.cpp +++ b/example/ck_tile/01_fmha/fmha_fwd.cpp @@ -142,7 +142,7 @@ auto create_args(int argc, char* argv[]) } // different threshold for different dtype -template +template auto get_elimit(std::string /*init_method*/) { double rtol = 1e-3; @@ -151,7 +151,7 @@ auto get_elimit(std::string /*init_method*/) } template <> -auto get_elimit(std::string /*init_method*/) +auto get_elimit(std::string /*init_method*/) { double rtol = 1e-2; double atol = 1e-2; @@ -159,7 +159,7 @@ auto get_elimit(std::string /*init_method*/) } template <> -auto get_elimit(std::string init_method) +auto get_elimit(std::string init_method) { if(init_method == "ui" || init_method == "ni") { @@ -261,7 +261,7 @@ int override_num_splits_if_necessary( return num_splits; } -template +template bool run(const ck_tile::ArgParser& arg_parser) { std::string data_type = arg_parser.get_str("prec"); @@ -305,8 +305,8 @@ bool run(const ck_tile::ArgParser& arg_parser) } ck_tile::index_t rotary_dim = arg_parser.get_int("rotary_dim"); - if constexpr(!(std::is_same_v || - std::is_same_v)) + if constexpr(!(std::is_same_v || + std::is_same_v)) { if(0 < rotary_dim) { @@ -428,25 +428,6 @@ bool run(const ck_tile::ArgParser& arg_parser) return atoi(squant_str.c_str()) != 0 ? true : false; }(); - float range_q = arg_parser.get_float("range_q"); - float range_k = arg_parser.get_float("range_k"); - float range_v = arg_parser.get_float("range_v"); - float range_p = arg_parser.get_float("range_p"); - float range_o = arg_parser.get_float("range_o"); - - float dtype_max = ck_tile::type_convert(ck_tile::numeric::max()); - - float scale_p = 1.f; - float scale_o = 1.f; - - if(squant) - { - scale_s = scale_s * (range_q / dtype_max) * (range_k / dtype_max); - scale_p = dtype_max / range_p; - // scale_p = [max(fp8_t)/range_o] * [range_p/max(fp8_t)] * [range_v/max(fp8_t)] - scale_o = range_p * range_v / range_o / dtype_max; - } - std::string vlayout = arg_parser.get_str("vlayout"); bool lse = arg_parser.get_bool("lse"); @@ -499,7 +480,7 @@ bool run(const ck_tile::ArgParser& arg_parser) const auto seqstart_k_host = to_seqstarts(seqlen_ks); const auto seqstart_k_with_padding_host = to_seqstarts(seqlen_kpads); - using TypeConfig = FmhaFwdTypeConfig; + using TypeConfig = FmhaFwdTypeConfig; using QDataType = typename TypeConfig::QDataType; using KDataType = typename TypeConfig::KDataType; @@ -513,6 +494,28 @@ bool run(const ck_tile::ArgParser& arg_parser) using OaccDataType = typename TypeConfig::OaccDataType; using ODataType = typename TypeConfig::ODataType; + float range_q = arg_parser.get_float("range_q"); + float range_k = arg_parser.get_float("range_k"); + float range_v = arg_parser.get_float("range_v"); + float range_p = arg_parser.get_float("range_p"); + float range_o = arg_parser.get_float("range_o"); + + float q_dtype_max = ck_tile::type_convert(ck_tile::numeric::max()); + float k_dtype_max = ck_tile::type_convert(ck_tile::numeric::max()); + float v_dtype_max = ck_tile::type_convert(ck_tile::numeric::max()); + float p_dtype_max = v_dtype_max; // assume p and v is the same type + float o_dtype_max = ck_tile::type_convert(ck_tile::numeric::max()); + + float scale_p = 1.f; + float scale_o = 1.f; + + if(squant) + { + scale_s = scale_s * (range_q / q_dtype_max) * (range_k / k_dtype_max); + scale_p = p_dtype_max / range_p; + scale_o = (o_dtype_max / range_o) * (range_p / p_dtype_max) * (range_v / v_dtype_max); + } + // accumulation numbers for performance evaluation std::size_t flop = 0, num_byte = 0; auto max_seqlen_q = @@ -709,14 +712,14 @@ bool run(const ck_tile::ArgParser& arg_parser) else if(init_method == "ufq" || init_method == "uf:q" || init_method == "3") // suitable for fp8 quantization { - ck_tile::FillUniformDistribution{-dtype_max, dtype_max, seed}(q_host); - ck_tile::FillUniformDistribution{-dtype_max, dtype_max, seed}(k_host); - ck_tile::FillUniformDistribution{-dtype_max, dtype_max, seed}(knew_host); - ck_tile::FillUniformDistribution{-dtype_max, dtype_max, seed}(v_host); - ck_tile::FillUniformDistribution{-dtype_max, dtype_max, seed}(vnew_host); + ck_tile::FillUniformDistribution{-q_dtype_max, q_dtype_max, seed}(q_host); + ck_tile::FillUniformDistribution{-k_dtype_max, k_dtype_max, seed}(k_host); + ck_tile::FillUniformDistribution{-k_dtype_max, k_dtype_max, seed}(knew_host); + ck_tile::FillUniformDistribution{-v_dtype_max, v_dtype_max, seed}(v_host); + ck_tile::FillUniformDistribution{-v_dtype_max, v_dtype_max, seed}(vnew_host); // bias_fp8 = qscale_bias * bias_fp32 - float qscale_bias = (dtype_max / range_q) * (dtype_max / range_k); + float qscale_bias = (q_dtype_max / range_q) * (k_dtype_max / range_k); // Assume bias is in [-1.f, 1.f] in original fp32 ck_tile::FillUniformDistribution{-qscale_bias, qscale_bias, seed}(bias_host); } @@ -1129,14 +1132,14 @@ bool run(const ck_tile::ArgParser& arg_parser) randval_buf.FromDevice(randval_host.data()); auto p_compute_element_func = [&]() { - if constexpr(std::is_same_v) + if constexpr(std::is_same_v) return ck_tile::scales{scale_p}; else return ck_tile::identity{}; }(); auto oacc_element_func = [&]() { - if constexpr(std::is_same_v) + if constexpr(std::is_same_v) return ck_tile::composes(ck_tile::saturates{}, ck_tile::scales{scale_o}); else @@ -1186,7 +1189,7 @@ bool run(const ck_tile::ArgParser& arg_parser) { decltype(q_host_ref) q_host_ref_ro(q_host_ref.get_lengths()); - auto [rotary_cos_slice, rotary_sin_slice] = + auto [rotary_cos_slice, rotary_sin_slice] = slice_rotary_cos_sin(rotary_cos_host, rotary_sin_host, cache_seqlen_ks[wb], real_seqlen_q); ck_tile::reference_batched_rotary_position_embedding( @@ -1202,13 +1205,13 @@ bool run(const ck_tile::ArgParser& arg_parser) k_host_ref.ForEach([&](auto& self, auto i) { self(i) = k_host(block_table_host(wb, i[1] / page_block_size), i[0] / nr, i[1] % page_block_size, i[2]); }); - } else { + } else { k_host_ref.ForEach([&](auto& self, auto i) { self(i) = k_host(block_table_host(wb, i[1] / page_block_size), i[1] % page_block_size, i[0] / nr, i[2]); }); } } else -#endif +#endif { if(i_perm) k_host_ref.ForEach([&](auto& self, auto i) { self(i) = k_host(cache_b_idx, i[0] / nr, i[1] + key_offset, i[2]); }); else k_host_ref.ForEach([&](auto& self, auto i) { self(i) = k_host(cache_b_idx, i[1] + key_offset, i[0] / nr, i[2]); }); @@ -1229,7 +1232,7 @@ bool run(const ck_tile::ArgParser& arg_parser) { knew_host_ref_ro.emplace(knew_host_ref.get_lengths()); - auto [rotary_cos_slice, rotary_sin_slice] = + auto [rotary_cos_slice, rotary_sin_slice] = slice_rotary_cos_sin(rotary_cos_host, rotary_sin_host, cache_seqlen_ks[wb], seqlen_knew); ck_tile::reference_batched_rotary_position_embedding( @@ -1251,19 +1254,19 @@ bool run(const ck_tile::ArgParser& arg_parser) if(0 < page_block_size) { if(is_v_rowmajor) { if(i_perm) { - v_host_ref.ForEach([&](auto& self, auto i) { - self(i) = v_host(block_table_host(wb, i[2] / page_block_size), i[0] / nr, i[2] % page_block_size, i[1]); + v_host_ref.ForEach([&](auto& self, auto i) { + self(i) = v_host(block_table_host(wb, i[2] / page_block_size), i[0] / nr, i[2] % page_block_size, i[1]); }); } else { - v_host_ref.ForEach([&](auto& self, auto i) { + v_host_ref.ForEach([&](auto& self, auto i) { self(i) = v_host(block_table_host(wb, i[2] / page_block_size), i[2] % page_block_size, i[0] / nr, i[1]); }); } } - else + else { - if(i_perm) { - v_host_ref.ForEach([&](auto& self, auto i) { + if(i_perm) { + v_host_ref.ForEach([&](auto& self, auto i) { self(i) = v_host(block_table_host(wb, i[2] / page_block_size), i[0] / nr, i[1], i[2] % page_block_size); }); } else { @@ -1458,7 +1461,7 @@ bool run(const ck_tile::ArgParser& arg_parser) else o_host_result.ForEach([&](auto& self, auto idx) { self(idx) = o_host(b_idx, idx[1] + query_offset, idx[0], idx[2]); }); // clang-format on - auto [rtol, atol] = get_elimit(init_method); + auto [rtol, atol] = get_elimit(init_method); bool cur_pass = ck_tile::check_err( o_host_result, o_host_ref, std::string("OUT Error: Incorrect results!"), rtol, atol); pass &= cur_pass; @@ -1515,15 +1518,15 @@ int main(int argc, char* argv[]) const std::string data_type = arg_parser.get_str("prec"); if(data_type == "fp16") { - return run(arg_parser) ? 0 : -2; + return run(arg_parser) ? 0 : -2; } else if(data_type == "bf16") { - return run(arg_parser) ? 0 : -2; + return run(arg_parser) ? 0 : -2; } else if(data_type == "fp8") { - return run(arg_parser) ? 0 : -2; + return run(arg_parser) ? 0 : -2; } return -3; diff --git a/example/ck_tile/01_fmha/fmha_fwd.hpp b/example/ck_tile/01_fmha/fmha_fwd.hpp index 8a821b9177..aee54b4758 100644 --- a/example/ck_tile/01_fmha/fmha_fwd.hpp +++ b/example/ck_tile/01_fmha/fmha_fwd.hpp @@ -16,11 +16,35 @@ #include #include +struct FmhaFwdFp16 +{ +}; + +struct FmhaFwdBf16 +{ +}; + +struct FmhaFwdFp8 +{ +}; + +struct FmhaFwdBf8 +{ +}; + +struct FmhaFwdFp8Fp16 +{ +}; + +struct FmhaFwdFp8Bf16 +{ +}; + template struct FmhaFwdTypeConfig; template <> -struct FmhaFwdTypeConfig +struct FmhaFwdTypeConfig { using QDataType = ck_tile::half_t; using KDataType = ck_tile::half_t; @@ -36,7 +60,7 @@ struct FmhaFwdTypeConfig }; template <> -struct FmhaFwdTypeConfig +struct FmhaFwdTypeConfig { using QDataType = ck_tile::bf16_t; using KDataType = ck_tile::bf16_t; @@ -52,7 +76,7 @@ struct FmhaFwdTypeConfig }; template <> -struct FmhaFwdTypeConfig +struct FmhaFwdTypeConfig { using QDataType = ck_tile::fp8_t; using KDataType = ck_tile::fp8_t; @@ -68,7 +92,7 @@ struct FmhaFwdTypeConfig }; template <> -struct FmhaFwdTypeConfig +struct FmhaFwdTypeConfig { using QDataType = ck_tile::bf8_t; using KDataType = ck_tile::bf8_t;