diff --git a/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp b/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp index 73d622ec9c..17651e229f 100644 --- a/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp +++ b/include/ck_tile/ops/fmha/kernel/fmha_fwd_appendkv_kernel.hpp @@ -403,16 +403,6 @@ struct FmhaFwdAppendKVKernel const index_t num_blocks = integer_divide_ceil(kargs.seqlen_k + kargs.seqlen_knew, kargs.page_block_size); - DEVICE_DEBUG_STMTS - { - printf("[DEVICE] block_indics: "); - for(index_t i_block = 0; i_block < num_blocks; ++i_block) - { - printf("(%d, %d) ", i_block, block_indices[i_block]); - } - printf("\n"); - } - const long_index_t fixed_offset = static_cast(i_nhead_ / kargs.nhead_ratio_qk) * kargs.nhead_stride_v; @@ -723,14 +713,7 @@ struct FmhaFwdAppendKVKernel auto [i_block0, k_dram_window_tmp] = k_tile_navigator.make_tile_window( k_dram_window, {skip_append_kv ? 0 : kargs.seqlen_k + i_n0, 0}); - DEVICE_DEBUG_STMTS - { - printf("[DEVICE] i_block0: %d\n", i_block0); - auto local_origin = k_dram_window_tmp.get_window_origin(); - printf("[DEVICE] origin: (%d, %d)\n", - local_origin.at(number<0>{}), - local_origin.at(number<1>{})); - } + auto knew_dram_window = make_tile_window(knew_dram, make_tuple(number{}, number{}), @@ -744,37 +727,12 @@ struct FmhaFwdAppendKVKernel auto [i_block1, v_dram_window_tmp] = v_tile_navigator.make_tile_window( v_dram_window, {0, skip_append_kv ? 0 : kargs.seqlen_k + i_n0}); - if constexpr(kIsPagedKV) - { - DEVICE_DEBUG_STMTS - { - printf("[DEVICE] i_block1: %d\n", i_block1); - auto local_origin = v_dram_window_tmp.get_window_origin(); - printf("[DEVICE] origin: (%d, %d)\n", - local_origin.at(number<0>{}), - local_origin.at(number<1>{})); - printf("[DEVICE] psychical block_ptr 0: %p\n", - static_cast(v_tile_navigator.physical_blocks + - 0 * v_tile_navigator.block_stride)); - printf("[DEVICE] psychical block_ptr 1: %p\n", - static_cast(v_tile_navigator.physical_blocks + - 1 * v_tile_navigator.block_stride)); - - printf("[DEVICE] tile window data ptr: %p\n", - static_cast(v_dram_window_tmp.get_bottom_tensor_view().buf_.p_data_)); - } - } auto vnew_dram_window = make_tile_window(vnew_dram, make_tuple(number{}, number{}), {0, i_n0}); - DEVICE_DEBUG_STMTS - { - printf("[DEVICE] skip_transform_q: %d, skip_appendkv: %d\n", - kargs.seqlen_q <= i_m0, - kargs.seqlen_knew <= i_n0); - } + if constexpr(kApplyRoPE) { FmhaPipeline{}(q_dram_window, diff --git a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp index 6f47a1410b..d70e6322e6 100644 --- a/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp +++ b/include/ck_tile/ops/fmha/pipeline/block_fmha_fwd_splitkv_pipeline_qr_ks_vs.hpp @@ -231,16 +231,7 @@ struct BlockFmhaFwdSplitKVPipelineQRKSVS }(); const auto num_total_loop = integer_divide_ceil(seqlen_k_end - adjusted_seqlen_k_start, kN0); -#if 0 - DEVICE_DEBUG_STMTS - { - printf("[DEVICE] seqlen_k_start: %d, seqlen_k_end: %d\n", seqlen_k_start, seqlen_k_end); - printf("[DEVICE] adjusted_seqlen_k_start: %d, num_total_loop: %d\n", - adjusted_seqlen_k_start, - num_total_loop); - printf("[DEVICE] kHasUnevenSplits: %d\n", kHasUnevenSplits); - } -#endif + // check early exit if masked and no work to do. if constexpr(FmhaMask::IsMasking || kHasUnevenSplits) {