diff --git a/include/ck_tile/ops/common/load_and_convert_tile.hpp b/include/ck_tile/ops/common/load_and_convert_tile.hpp index f2ee23e98b..b268ef6811 100644 --- a/include/ck_tile/ops/common/load_and_convert_tile.hpp +++ b/include/ck_tile/ops/common/load_and_convert_tile.hpp @@ -28,15 +28,14 @@ struct ConverterLoader } }; -template CK_TILE_DEVICE void load_and_convert_tile(WarpTile& dst, const WarpWindow& src) { - if constexpr(std::is_same_v) + if constexpr(std::is_same_v) { static_assert(!LoadTranspose, "LoadTranspose not supported with pk_int4_t"); ConverterLoader::load_interleaved_pk_type(dst, src); diff --git a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp index 040051a5e8..a22b0dcf65 100644 --- a/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp +++ b/include/ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp @@ -228,10 +228,10 @@ struct BlockUniversalGemmAsBsCr "The ADataType and BDataType as defined in " "traits should be the same as correspoinding block window data type!"); - load_and_convert_tile( - a_warp_tile_, a_block_window); - load_and_convert_tile( - b_warp_tile_, b_block_window); + load_and_convert_tile(a_warp_tile_, + a_block_window); + load_and_convert_tile(b_warp_tile_, + b_block_window); // hot loop: static_for<0, GemmTraits::KIterPerWarp, 1>{}([&](auto kIter) { static_for<0, MIterPerWarp, 1>{}([&](auto mIter) { @@ -294,10 +294,10 @@ struct BlockUniversalGemmAsBsCr bool_constant = {}, bool_constant = {}) { - load_and_convert_tile( - a_warp_tile_, a_block_window); - load_and_convert_tile( - b_warp_tile_, b_block_window); + load_and_convert_tile(a_warp_tile_, + a_block_window); + load_and_convert_tile(b_warp_tile_, + b_block_window); } // C += A * B @@ -425,10 +425,10 @@ struct BlockUniversalGemmAsBsCr auto b_lds_gemm_window = make_tile_window( b_block_window.get_bottom_tensor_view(), b_lds_shape, b_offset, b_lds_load_distr); - load_and_convert_tile( - a_warp_tile_, a_lds_gemm_window); - load_and_convert_tile( - b_warp_tile_, b_lds_gemm_window); + load_and_convert_tile(a_warp_tile_, + a_lds_gemm_window); + load_and_convert_tile(b_warp_tile_, + b_lds_gemm_window); } // C += A * B diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp index 6959e9e05a..e0556f6a6a 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp @@ -64,8 +64,7 @@ struct GemmPipelineAgBgCrImplBase CK_TILE_HOST_DEVICE static constexpr auto TransposeC() { return Problem::TransposeC; } - template (dst_block_tile, - dram_tile_window); + load_and_convert_tile(dst_block_tile, dram_tile_window); move_tile_window(dram_tile_window, dram_tile_window_step); } diff --git a/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v2.hpp b/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v2.hpp index 9939a9586e..3f1e8dfc81 100644 --- a/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v2.hpp +++ b/include/ck_tile/ops/gemm/pipeline/wp_pipeline_agmem_bgmem_creg_v2.hpp @@ -627,7 +627,7 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2 // // Prefetch A0 Base::GlobalPrefetch(a_global_tile, a_copy_dram_window, a_dram_tile_window_step); - Base::template GlobalPrefetch( + Base::template GlobalPrefetch( b_global_tile[0], b_flat_dram_window, b_dram_tile_window_step); // Prefill A0 @@ -652,7 +652,7 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2 do { { - Base::template GlobalPrefetch( + Base::template GlobalPrefetch( b_global_tile[1], b_flat_dram_window, b_dram_tile_window_step); Base::LocalPrefill(a_copy_lds_windows[I1], a_global_tile); Base::GlobalPrefetch( @@ -666,7 +666,7 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2 HotLoopScheduler(); } { - Base::template GlobalPrefetch( + Base::template GlobalPrefetch( b_global_tile[0], b_flat_dram_window, b_dram_tile_window_step); Base::LocalPrefill(a_copy_lds_windows[I0], a_global_tile); Base::GlobalPrefetch( @@ -687,7 +687,7 @@ struct WeightPreshufflePipelineAGmemBGmemCRegV2 if constexpr(TailNum == TailNumber::Even) { { - Base::template GlobalPrefetch( + Base::template GlobalPrefetch( b_global_tile[1], b_flat_dram_window, b_dram_tile_window_step); Base::LocalPrefill(a_copy_lds_windows[I1], a_global_tile); block_weight_preshuffle( diff --git a/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_bquant_cr.hpp b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_bquant_cr.hpp index 132b31ed62..5b4056e699 100644 --- a/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_bquant_cr.hpp +++ b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_bquant_cr.hpp @@ -261,10 +261,10 @@ struct ABQuantBlockUniversalGemmAsBsCr : public BlockGemmQuantBase bool_constant = {}, bool_constant = {}) { - load_and_convert_tile( + load_and_convert_tile( a_warp_tile_, a_block_window); // If B datatype were pkint4 it would be converted prior to storing in LDS - load_and_convert_tile( + load_and_convert_tile( b_warp_tile_, b_block_window); } diff --git a/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp index b40168b2af..ea411441ff 100644 --- a/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp +++ b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_aquant_bs_cr.hpp @@ -248,10 +248,10 @@ struct AQuantBlockUniversalGemmAsBsCr // while ADatatype might not be the same as BDataType at the time of problem // initialization, we can safely use BDataType here because when A would be int4 we will // ensure A is converted to BDataType prior to loading - load_and_convert_tile( - a_warp_tile_, a_block_window); - load_and_convert_tile( - b_warp_tile_, b_block_window); + load_and_convert_tile(a_warp_tile_, + a_block_window); + load_and_convert_tile(b_warp_tile_, + b_block_window); } // C += A * B diff --git a/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_bs_bquant_cr.hpp b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_bs_bquant_cr.hpp index ece393b40d..cddc8b0dcd 100644 --- a/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_bs_bquant_cr.hpp +++ b/include/ck_tile/ops/gemm_quant/block/block_universal_gemm_as_bs_bquant_cr.hpp @@ -258,11 +258,11 @@ struct BQuantBlockUniversalGemmAsBsCr bool_constant = {}, bool_constant = {}) { - load_and_convert_tile( - a_warp_tile_, a_block_window); + load_and_convert_tile(a_warp_tile_, + a_block_window); // If B datatype were pkint4 it would be converted prior to storing in LDS - load_and_convert_tile( - b_warp_tile_, b_block_window); + load_and_convert_tile(b_warp_tile_, + b_block_window); } // C += A * B diff --git a/include/ck_tile/ops/gemm_quant/pipeline/gemm_abquant_pipeline_ag_bg_cr_v3.hpp b/include/ck_tile/ops/gemm_quant/pipeline/gemm_abquant_pipeline_ag_bg_cr_v3.hpp index bcb38bd3e8..2ff477e5ec 100644 --- a/include/ck_tile/ops/gemm_quant/pipeline/gemm_abquant_pipeline_ag_bg_cr_v3.hpp +++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_abquant_pipeline_ag_bg_cr_v3.hpp @@ -198,10 +198,8 @@ struct ABQuantGemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3(a_block_tile, - a_dram_window); + load_and_convert_tile(a_block_tile, a_dram_window); } template @@ -209,10 +207,8 @@ struct ABQuantGemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3(b_block_tile, - b_dram_window); + load_and_convert_tile(b_block_tile, b_dram_window); } template ( + Base::template GlobalPrefetch( aq_block_tile[currIdx], aq_copy_dram_window, aq_dram_tile_window_step); - Base::template GlobalPrefetch( + Base::template GlobalPrefetch( bq_block_tile[currIdx], bq_copy_dram_window, bq_dram_tile_window_step); tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile); @@ -436,10 +432,10 @@ struct ABQuantGemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3(aq_block_tile[(currIdx + 1) % 2], + Base::template GlobalPrefetch(aq_block_tile[(currIdx + 1) % 2], aq_copy_dram_window, aq_dram_tile_window_step); - Base::template GlobalPrefetch(bq_block_tile[(currIdx + 1) % 2], + Base::template GlobalPrefetch(bq_block_tile[(currIdx + 1) % 2], bq_copy_dram_window, bq_dram_tile_window_step); @@ -471,10 +467,10 @@ struct ABQuantGemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3(aq_block_tile[(currIdx + 1) % 2], + Base::template GlobalPrefetch(aq_block_tile[(currIdx + 1) % 2], aq_copy_dram_window, aq_dram_tile_window_step); - Base::template GlobalPrefetch(bq_block_tile[(currIdx + 1) % 2], + Base::template GlobalPrefetch(bq_block_tile[(currIdx + 1) % 2], bq_copy_dram_window, bq_dram_tile_window_step); block_gemm(c_block_tile, diff --git a/include/ck_tile/ops/gemm_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_mem.hpp b/include/ck_tile/ops/gemm_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_mem.hpp index 1742dbf639..fc1d14a737 100644 --- a/include/ck_tile/ops/gemm_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_mem.hpp +++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_mem.hpp @@ -175,10 +175,8 @@ struct AQuantGemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem const DramTileWindowStep& dram_tile_window_step) { using DestDataType = typename ABlockTile_::DataType; - using SrcDataType = typename ADramWindow::Base::TileWindowBase::DataType; constexpr index_t UnaryOpSize = 8; - load_and_convert_tile(a_block_tile, - a_dram_window); + load_and_convert_tile(a_block_tile, a_dram_window); move_tile_window(a_dram_window, dram_tile_window_step); } @@ -286,9 +284,9 @@ struct AQuantGemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem // Global prefetch initialization - DRAM to VGPRs LoadAndConvertATile( a_block_tiles.get(I0{}), a_copy_dram_window, a_dram_tile_window_step); - Base::template GlobalPrefetch( + Base::template GlobalPrefetch( b_block_tiles.get(I0{}), b_copy_dram_window, b_dram_tile_window_step); - Base::template GlobalPrefetch( + Base::template GlobalPrefetch( aq_block_tiles.get(I0{}), aq_copy_dram_window, aq_dram_tile_window_step); tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile); @@ -321,10 +319,10 @@ struct AQuantGemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem LoadAndConvertATile(a_block_tiles.get(number{}), a_copy_dram_window, a_dram_tile_window_step); - Base::template GlobalPrefetch(b_block_tiles.get(number{}), + Base::template GlobalPrefetch(b_block_tiles.get(number{}), b_copy_dram_window, b_dram_tile_window_step); - Base::template GlobalPrefetch(aq_block_tiles.get(number{}), + Base::template GlobalPrefetch(aq_block_tiles.get(number{}), aq_copy_dram_window, aq_dram_tile_window_step); }); @@ -381,10 +379,10 @@ struct AQuantGemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem LoadAndConvertATile(a_block_tiles.get(number{}), a_copy_dram_window, a_dram_tile_window_step); - Base::template GlobalPrefetch(b_block_tiles.get(number{}), + Base::template GlobalPrefetch(b_block_tiles.get(number{}), b_copy_dram_window, b_dram_tile_window_step); - Base::template GlobalPrefetch(aq_block_tiles.get(number{}), + Base::template GlobalPrefetch(aq_block_tiles.get(number{}), aq_copy_dram_window, aq_dram_tile_window_step); }); diff --git a/include/ck_tile/ops/gemm_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_v3.hpp b/include/ck_tile/ops/gemm_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_v3.hpp index 114a89d95f..e2d9f50299 100644 --- a/include/ck_tile/ops/gemm_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_v3.hpp +++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_aquant_pipeline_ag_bg_cr_v3.hpp @@ -169,10 +169,8 @@ struct AQuantGemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3(a_block_tile, - a_dram_window); + load_and_convert_tile(a_block_tile, a_dram_window); } template (b_block_tile, b_copy_dram_window, b_dram_tile_window_step); - Base::template GlobalPrefetch( + Base::template GlobalPrefetch(b_block_tile, b_copy_dram_window, b_dram_tile_window_step); + Base::template GlobalPrefetch( aq_block_tile[currIdx], aq_copy_dram_window, aq_dram_tile_window_step); tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile); @@ -309,7 +307,7 @@ struct AQuantGemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3(b_block_tile, b_copy_dram_window, b_dram_tile_window_step); + Base::template GlobalPrefetch(b_block_tile, b_copy_dram_window, b_dram_tile_window_step); block_sync_lds(); @@ -352,8 +350,8 @@ struct AQuantGemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3(b_block_tile, b_copy_dram_window, b_dram_tile_window_step); - Base::template GlobalPrefetch(aq_block_tile[(currIdx + 1) % 2], + Base::template GlobalPrefetch(b_block_tile, b_copy_dram_window, b_dram_tile_window_step); + Base::template GlobalPrefetch(aq_block_tile[(currIdx + 1) % 2], aq_copy_dram_window, aq_dram_tile_window_step); @@ -379,7 +377,7 @@ struct AQuantGemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3(aq_block_tile[(currIdx + 1) % 2], + Base::template GlobalPrefetch(aq_block_tile[(currIdx + 1) % 2], aq_copy_dram_window, aq_dram_tile_window_step); block_gemm( diff --git a/include/ck_tile/ops/gemm_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_v3.hpp b/include/ck_tile/ops/gemm_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_v3.hpp index e0e67355e4..7ef79bd32d 100644 --- a/include/ck_tile/ops/gemm_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_v3.hpp +++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_bquant_pipeline_ag_bg_cr_v3.hpp @@ -183,10 +183,8 @@ struct BQuantGemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3(b_block_tile, - b_dram_window); + load_and_convert_tile(b_block_tile, b_dram_window); } template @@ -202,7 +200,7 @@ struct BQuantGemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3(b_block_tile, b_copy_dram_window, b_dram_tile_window_step); + Base::template GlobalPrefetch(b_block_tile, b_copy_dram_window, b_dram_tile_window_step); } } @@ -312,10 +310,10 @@ struct BQuantGemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3(a_block_tile, a_copy_dram_window, a_dram_tile_window_step); + Base::template GlobalPrefetch(a_block_tile, a_copy_dram_window, a_dram_tile_window_step); // B tile gets converted to A datatype during loading BGlobalPrefetch(b_block_tile, b_copy_dram_window, b_dram_tile_window_step); - Base::template GlobalPrefetch( + Base::template GlobalPrefetch( bq_block_tile[currIdx], bq_copy_dram_window, bq_dram_tile_window_step); tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile); @@ -345,7 +343,7 @@ struct BQuantGemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3(a_block_tile, a_copy_dram_window, a_dram_tile_window_step); + Base::template GlobalPrefetch(a_block_tile, a_copy_dram_window, a_dram_tile_window_step); // B tile gets converted to A datatype during loading BGlobalPrefetch(b_block_tile, b_copy_dram_window, b_dram_tile_window_step); @@ -389,10 +387,10 @@ struct BQuantGemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3(a_block_tile, a_copy_dram_window, a_dram_tile_window_step); + Base::template GlobalPrefetch(a_block_tile, a_copy_dram_window, a_dram_tile_window_step); // B tile gets converted to A datatype during loading BGlobalPrefetch(b_block_tile, b_copy_dram_window, b_dram_tile_window_step); - Base::template GlobalPrefetch(bq_block_tile[(currIdx + 1) % 2], + Base::template GlobalPrefetch(bq_block_tile[(currIdx + 1) % 2], bq_copy_dram_window, bq_dram_tile_window_step); @@ -418,7 +416,7 @@ struct BQuantGemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3(bq_block_tile[(currIdx + 1) % 2], + Base::template GlobalPrefetch(bq_block_tile[(currIdx + 1) % 2], bq_copy_dram_window, bq_dram_tile_window_step); block_gemm( diff --git a/include/ck_tile/ops/gemm_quant/pipeline/gemm_mxfp4_pipeline_ag_bg_cr_v3.hpp b/include/ck_tile/ops/gemm_quant/pipeline/gemm_mxfp4_pipeline_ag_bg_cr_v3.hpp index e8af4bd893..7d090a788a 100644 --- a/include/ck_tile/ops/gemm_quant/pipeline/gemm_mxfp4_pipeline_ag_bg_cr_v3.hpp +++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_mxfp4_pipeline_ag_bg_cr_v3.hpp @@ -419,8 +419,8 @@ struct MxFp4GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3(a_block_tile, a_copy_dram_window, a_dram_tile_window_step); - Base::template GlobalPrefetch(b_fp4_block_tile, b_copy_dram_window, b_dram_tile_window_step); + Base::template GlobalPrefetch(a_block_tile, a_copy_dram_window, a_dram_tile_window_step); + Base::template GlobalPrefetch(b_fp4_block_tile, b_copy_dram_window, b_dram_tile_window_step); // BDataType auto b_block_tile = make_static_distributed_tensor( Policy::template MakeBRegTileDistribution()); @@ -480,8 +480,8 @@ struct MxFp4GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3(a_block_tile, a_copy_dram_window, a_dram_tile_window_step); - Base::template GlobalPrefetch(b_fp4_block_tile, b_copy_dram_window, b_dram_tile_window_step); + Base::template GlobalPrefetch(a_block_tile, a_copy_dram_window, a_dram_tile_window_step); + Base::template GlobalPrefetch(b_fp4_block_tile, b_copy_dram_window, b_dram_tile_window_step); bq_block_tile = load_tile(bq_copy_dram_window); move_tile_window(bq_copy_dram_window, {0, b_scale_dram_tile_window_step}); @@ -544,8 +544,8 @@ struct MxFp4GemmPipelineAgBgCrCompV3 : public BaseGemmPipelineAgBgCrCompV3(a_block_tile, a_copy_dram_window, a_dram_tile_window_step); - Base::template GlobalPrefetch( + Base::template GlobalPrefetch(a_block_tile, a_copy_dram_window, a_dram_tile_window_step); + Base::template GlobalPrefetch( b_fp4_block_tile, b_copy_dram_window, b_dram_tile_window_step); bq_block_tile = load_tile(bq_copy_dram_window); diff --git a/include/ck_tile/ops/gemm_quant/pipeline/gemm_wp_abquant_pipeline_ag_bg_cr_v2.hpp b/include/ck_tile/ops/gemm_quant/pipeline/gemm_wp_abquant_pipeline_ag_bg_cr_v2.hpp index 49064bdb76..f3886146ee 100644 --- a/include/ck_tile/ops/gemm_quant/pipeline/gemm_wp_abquant_pipeline_ag_bg_cr_v2.hpp +++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_wp_abquant_pipeline_ag_bg_cr_v2.hpp @@ -349,7 +349,7 @@ struct WPABQuantBPipelineAgBgCrV2 : public WeightPreshufflePipelineAGmemBGmemCRe move_tile_window(b_flat_dram_windows(nIter)(kIter), {nIter * flatNPerWarp, kIter * flatKPerWarp}); - load_and_convert_tile( + load_and_convert_tile( b_warp_tensor_ping(nIter)(kIter), b_flat_dram_windows(nIter)(kIter)); }); }); @@ -430,7 +430,7 @@ struct WPABQuantBPipelineAgBgCrV2 : public WeightPreshufflePipelineAGmemBGmemCRe move_tile_window(b_flat_dram_windows(nIter)(kIter), {nIter * flatNPerWarp, kIter * flatKPerWarp}); - load_and_convert_tile( + load_and_convert_tile( b_warp_tensor_pong(nIter)(kIter), b_flat_dram_windows(nIter)(kIter)); }); }); @@ -455,7 +455,7 @@ struct WPABQuantBPipelineAgBgCrV2 : public WeightPreshufflePipelineAGmemBGmemCRe move_tile_window(b_flat_dram_windows(nIter)(kIter), {nIter * flatNPerWarp, kIter * flatKPerWarp}); - load_and_convert_tile( + load_and_convert_tile( b_warp_tensor_ping(nIter)(kIter), b_flat_dram_windows(nIter)(kIter)); }); }); @@ -503,7 +503,7 @@ struct WPABQuantBPipelineAgBgCrV2 : public WeightPreshufflePipelineAGmemBGmemCRe move_tile_window(b_flat_dram_windows(nIter)(kIter), {nIter * flatNPerWarp, kIter * flatKPerWarp}); - load_and_convert_tile( + load_and_convert_tile( b_warp_tensor_pong(nIter)(kIter), b_flat_dram_windows(nIter)(kIter)); }); }); diff --git a/include/ck_tile/ops/gemm_quant/pipeline/gemm_wp_bquant_pipeline_ag_bg_cr_v2.hpp b/include/ck_tile/ops/gemm_quant/pipeline/gemm_wp_bquant_pipeline_ag_bg_cr_v2.hpp index 5455944de0..3ba7064d09 100644 --- a/include/ck_tile/ops/gemm_quant/pipeline/gemm_wp_bquant_pipeline_ag_bg_cr_v2.hpp +++ b/include/ck_tile/ops/gemm_quant/pipeline/gemm_wp_bquant_pipeline_ag_bg_cr_v2.hpp @@ -335,8 +335,8 @@ struct WPQuantBPipelineAgBgCrV2 : public WeightPreshufflePipelineAGmemBGmemCRegV move_tile_window(b_flat_dram_windows(nIter)(kIter), {nIter * flatNPerWarp, kIter * flatKPerWarp}); - load_and_convert_tile( - b_warp_tensor_ping(nIter)(kIter), b_flat_dram_windows(nIter)(kIter)); + load_and_convert_tile(b_warp_tensor_ping(nIter)(kIter), + b_flat_dram_windows(nIter)(kIter)); }); }); // move B window to next flat K @@ -421,7 +421,7 @@ struct WPQuantBPipelineAgBgCrV2 : public WeightPreshufflePipelineAGmemBGmemCRegV move_tile_window(b_flat_dram_windows(nIter)(kIter), {nIter * flatNPerWarp, kIter * flatKPerWarp}); - load_and_convert_tile( + load_and_convert_tile( b_warp_tensor_pong(nIter)(kIter), b_flat_dram_windows(nIter)(kIter)); }); }); @@ -458,7 +458,7 @@ struct WPQuantBPipelineAgBgCrV2 : public WeightPreshufflePipelineAGmemBGmemCRegV move_tile_window(b_flat_dram_windows(nIter)(kIter), {nIter * flatNPerWarp, kIter * flatKPerWarp}); - load_and_convert_tile( + load_and_convert_tile( b_warp_tensor_ping(nIter)(kIter), b_flat_dram_windows(nIter)(kIter)); }); }); @@ -516,7 +516,7 @@ struct WPQuantBPipelineAgBgCrV2 : public WeightPreshufflePipelineAGmemBGmemCRegV move_tile_window(b_flat_dram_windows(nIter)(kIter), {nIter * flatNPerWarp, kIter * flatKPerWarp}); - load_and_convert_tile( + load_and_convert_tile( b_warp_tensor_pong(nIter)(kIter), b_flat_dram_windows(nIter)(kIter)); }); });