From 18114323d53374d66b8af76ccf07e852456092f7 Mon Sep 17 00:00:00 2001 From: jakpiase Date: Thu, 6 Feb 2025 21:12:13 +0100 Subject: [PATCH] add vectorloads on non-k dim for memory pipelines (#1856) [ROCm/composable_kernel commit: 9b5dfba2422964ec17bc20bf06227ea9771a6ad8] --- .../pipeline/gemm_pipeline_ag_bg_cr_mem.hpp | 291 ++++++++++++++---- ...emm_universal_pipeline_ag_bg_cr_policy.hpp | 2 +- test/ck_tile/gemm/test_gemm_pipeline.cpp | 30 +- 3 files changed, 251 insertions(+), 72 deletions(-) diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp index 38c663f4c3..e23f0cda7d 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp @@ -90,7 +90,7 @@ struct BaseGemmPipelineAgBgCrMem // LocalPreFillStages: 1 // LocalPreFetchStages: 0 // LocalSharedMemoryBuffer: 1 -template +template struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem { using Base = BaseGemmPipelineAgBgCrMem; @@ -165,11 +165,22 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem "A/B Dram block window should have the same data type as appropriate " "([A|B]DataType) defined in Problem definition!"); - static_assert(MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I0{}] && - NPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[I0{}] && - KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I1{}], - "A/B block window appropriate sizes must be equal to MPerBlock/NPerblock" - " or KPerBlock!"); + constexpr bool is_a_col_major = + std::is_same_v; + constexpr bool is_b_row_major = std::is_same_v; + + static_assert(is_a_col_major + ? (KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I0{}] && + MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I1{}]) + : (MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I0{}] && + KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I1{}]), + "A block window has incorrect lengths for defined ALayout!"); + static_assert(is_b_row_major + ? (KPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[I0{}] && + NPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[I1{}]) + : (NPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[I0{}] && + KPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[I1{}]), + "B block window has incorrect lengths for defined BLayout!"); // ------------------------------------------------------------------------------------ // Definitions of all needed tiles @@ -213,25 +224,59 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem tuple_array a_block_tiles; tuple_array b_block_tiles; + using ADramTileWindowStep = typename ADramBlockWindowTmp::BottomTensorIndex; + using BDramTileWindowStep = typename BDramBlockWindowTmp::BottomTensorIndex; + + constexpr ADramTileWindowStep a_dram_tile_window_step = + is_a_col_major ? make_array(KPerBlock, 0) : make_array(0, KPerBlock); + constexpr BDramTileWindowStep b_dram_tile_window_step = + is_b_row_major ? make_array(KPerBlock, 0) : make_array(0, KPerBlock); + // ----------------------------------------------------------------------------------------- // Gemm pipeline start // prefetch // global read 0 - Base::GlobalPrefetch(a_block_tiles.get(I0{}), a_copy_dram_window); - Base::GlobalPrefetch(b_block_tiles.get(I0{}), b_copy_dram_window); + Base::GlobalPrefetch( + a_block_tiles.get(I0{}), a_copy_dram_window, a_dram_tile_window_step); + Base::GlobalPrefetch( + b_block_tiles.get(I0{}), b_copy_dram_window, b_dram_tile_window_step); // initialize C tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile); // LDS write 0 - Base::LocalPrefill(a_copy_lds_window, a_block_tiles.get(I0{}), a_element_func); - Base::LocalPrefill(b_copy_lds_window, b_block_tiles.get(I0{}), b_element_func); + if constexpr(is_a_col_major) + { + auto a_shuffle_tmp = make_static_distributed_tensor( + Policy::template MakeShuffledARegTileDistribution()); + transpose_tile2d(a_shuffle_tmp, a_block_tiles.get(I0{})); + Base::LocalPrefill(a_copy_lds_window, a_shuffle_tmp, a_element_func); + } + else + { + Base::LocalPrefill(a_copy_lds_window, a_block_tiles.get(I0{}), a_element_func); + } + if constexpr(is_b_row_major) + { + auto b_shuffle_tmp = make_static_distributed_tensor( + Policy::template MakeShuffledBRegTileDistribution()); + transpose_tile2d(b_shuffle_tmp, b_block_tiles.get(I0{})); + Base::LocalPrefill(b_copy_lds_window, b_shuffle_tmp, b_element_func); + } + else + { + Base::LocalPrefill(b_copy_lds_window, b_block_tiles.get(I0{}), b_element_func); + } // Global prefetch [1, PrefetchStages] static_for<1, PrefetchStages, 1>{}([&](auto prefetch_idx) { - Base::GlobalPrefetch(a_block_tiles.get(number{}), a_copy_dram_window); - Base::GlobalPrefetch(b_block_tiles.get(number{}), b_copy_dram_window); + Base::GlobalPrefetch(a_block_tiles.get(number{}), + a_copy_dram_window, + a_dram_tile_window_step); + Base::GlobalPrefetch(b_block_tiles.get(number{}), + b_copy_dram_window, + b_dram_tile_window_step); }); // main body @@ -247,19 +292,45 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem block_sync_lds(); - Base::LocalPrefill( - a_copy_lds_window, - a_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}), - a_element_func); - Base::LocalPrefill( - b_copy_lds_window, - b_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}), - b_element_func); + if constexpr(is_a_col_major) + { + auto a_shuffle_tmp = make_static_distributed_tensor( + Policy::template MakeShuffledARegTileDistribution()); + transpose_tile2d( + a_shuffle_tmp, + a_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{})); + Base::LocalPrefill(a_copy_lds_window, a_shuffle_tmp, a_element_func); + } + else + { + Base::LocalPrefill( + a_copy_lds_window, + a_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}), + a_element_func); + } + if constexpr(is_b_row_major) + { + auto b_shuffle_tmp = make_static_distributed_tensor( + Policy::template MakeShuffledBRegTileDistribution()); + transpose_tile2d( + b_shuffle_tmp, + b_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{})); + Base::LocalPrefill(b_copy_lds_window, b_shuffle_tmp, b_element_func); + } + else + { + Base::LocalPrefill( + b_copy_lds_window, + b_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}), + b_element_func); + } Base::GlobalPrefetch(a_block_tiles.get(number{}), - a_copy_dram_window); + a_copy_dram_window, + a_dram_tile_window_step); Base::GlobalPrefetch(b_block_tiles.get(number{}), - b_copy_dram_window); + b_copy_dram_window, + b_dram_tile_window_step); }); i += PrefetchStages; @@ -275,12 +346,32 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem block_sync_lds(); - Base::LocalPrefill(a_copy_lds_window, - a_block_tiles.get(number{}), - a_element_func); - Base::LocalPrefill(b_copy_lds_window, - b_block_tiles.get(number{}), - b_element_func); + if constexpr(is_a_col_major) + { + auto a_shuffle_tmp = make_static_distributed_tensor( + Policy::template MakeShuffledARegTileDistribution()); + transpose_tile2d(a_shuffle_tmp, a_block_tiles.get(number{})); + Base::LocalPrefill(a_copy_lds_window, a_shuffle_tmp, a_element_func); + } + else + { + Base::LocalPrefill(a_copy_lds_window, + a_block_tiles.get(number{}), + a_element_func); + } + if constexpr(is_b_row_major) + { + auto b_shuffle_tmp = make_static_distributed_tensor( + Policy::template MakeShuffledBRegTileDistribution()); + transpose_tile2d(b_shuffle_tmp, b_block_tiles.get(number{})); + Base::LocalPrefill(b_copy_lds_window, b_shuffle_tmp, b_element_func); + } + else + { + Base::LocalPrefill(b_copy_lds_window, + b_block_tiles.get(number{}), + b_element_func); + } }); block_sync_lds(); @@ -352,11 +443,22 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem "A/B Dram block window should have the same data type as appropriate " "([A|B]DataType) defined in Problem definition!"); - static_assert(MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I0{}] && - NPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[I0{}] && - KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I1{}], - "A/B block window appropriate sizes must be equal to MPerBlock/NPerblock" - " or KPerBlock!"); + constexpr bool is_a_col_major = + std::is_same_v; + constexpr bool is_b_row_major = std::is_same_v; + + static_assert(is_a_col_major + ? (KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I0{}] && + MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I1{}]) + : (MPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I0{}] && + KPerBlock == ADramBlockWindowTmp{}.get_window_lengths()[I1{}]), + "A block window has incorrect lengths for defined ALayout!"); + static_assert(is_b_row_major + ? (KPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[I0{}] && + NPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[I1{}]) + : (NPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[I0{}] && + KPerBlock == BDramBlockWindowTmp{}.get_window_lengths()[I1{}]), + "B block window has incorrect lengths for defined BLayout!"); // ------------------------------------------------------------------------------------ // Definitions of all needed tiles @@ -400,25 +502,58 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem tuple_array a_block_tiles; tuple_array b_block_tiles; + using ADramTileWindowStep = typename ADramBlockWindowTmp::BottomTensorIndex; + using BDramTileWindowStep = typename BDramBlockWindowTmp::BottomTensorIndex; + + constexpr ADramTileWindowStep a_dram_tile_window_step = + is_a_col_major ? make_array(KPerBlock, 0) : make_array(0, KPerBlock); + constexpr BDramTileWindowStep b_dram_tile_window_step = + is_b_row_major ? make_array(KPerBlock, 0) : make_array(0, KPerBlock); // ----------------------------------------------------------------------------------------- // Gemm pipeline start // prefetch // global read 0 - Base::GlobalPrefetch(a_block_tiles.get(I0{}), a_copy_dram_window); - Base::GlobalPrefetch(b_block_tiles.get(I0{}), b_copy_dram_window); + Base::GlobalPrefetch( + a_block_tiles.get(I0{}), a_copy_dram_window, a_dram_tile_window_step); + Base::GlobalPrefetch( + b_block_tiles.get(I0{}), b_copy_dram_window, b_dram_tile_window_step); // initialize C tile_elementwise_inout([](auto& c) { c = 0; }, c_block_tile); // LDS write 0 - Base::LocalPrefill(a_copy_lds_window, a_block_tiles.get(I0{}), a_element_func); - Base::LocalPrefill(b_copy_lds_window, b_block_tiles.get(I0{}), b_element_func); + if constexpr(is_a_col_major) + { + auto a_shuffle_tmp = make_static_distributed_tensor( + Policy::template MakeShuffledARegTileDistribution()); + transpose_tile2d(a_shuffle_tmp, a_block_tiles.get(I0{})); + Base::LocalPrefill(a_copy_lds_window, a_shuffle_tmp, a_element_func); + } + else + { + Base::LocalPrefill(a_copy_lds_window, a_block_tiles.get(I0{}), a_element_func); + } + if constexpr(is_b_row_major) + { + auto b_shuffle_tmp = make_static_distributed_tensor( + Policy::template MakeShuffledBRegTileDistribution()); + transpose_tile2d(b_shuffle_tmp, b_block_tiles.get(I0{})); + Base::LocalPrefill(b_copy_lds_window, b_shuffle_tmp, b_element_func); + } + else + { + Base::LocalPrefill(b_copy_lds_window, b_block_tiles.get(I0{}), b_element_func); + } // Global prefetch [1, PrefetchStages] static_for<1, PrefetchStages, 1>{}([&](auto prefetch_idx) { - Base::GlobalPrefetch(a_block_tiles.get(number{}), a_copy_dram_window); - Base::GlobalPrefetch(b_block_tiles.get(number{}), b_copy_dram_window); + Base::GlobalPrefetch(a_block_tiles.get(number{}), + a_copy_dram_window, + a_dram_tile_window_step); + Base::GlobalPrefetch(b_block_tiles.get(number{}), + b_copy_dram_window, + b_dram_tile_window_step); }); // main body @@ -432,19 +567,45 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); // no second block_sync_lds because it's interwave - Base::LocalPrefill( - a_copy_lds_window, - a_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}), - a_element_func); - Base::LocalPrefill( - b_copy_lds_window, - b_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}), - b_element_func); + if constexpr(is_a_col_major) + { + auto a_shuffle_tmp = make_static_distributed_tensor( + Policy::template MakeShuffledARegTileDistribution()); + transpose_tile2d( + a_shuffle_tmp, + a_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{})); + Base::LocalPrefill(a_copy_lds_window, a_shuffle_tmp, a_element_func); + } + else + { + Base::LocalPrefill( + a_copy_lds_window, + a_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}), + a_element_func); + } + if constexpr(is_b_row_major) + { + auto b_shuffle_tmp = make_static_distributed_tensor( + Policy::template MakeShuffledBRegTileDistribution()); + transpose_tile2d( + b_shuffle_tmp, + b_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{})); + Base::LocalPrefill(b_copy_lds_window, b_shuffle_tmp, b_element_func); + } + else + { + Base::LocalPrefill( + b_copy_lds_window, + b_block_tiles.get(number<(prefetch_idx + 1) % PrefetchStages>{}), + b_element_func); + } Base::GlobalPrefetch(a_block_tiles.get(number{}), - a_copy_dram_window); + a_copy_dram_window, + a_dram_tile_window_step); Base::GlobalPrefetch(b_block_tiles.get(number{}), - b_copy_dram_window); + b_copy_dram_window, + b_dram_tile_window_step); }); i += PrefetchStages; @@ -457,12 +618,32 @@ struct GemmPipelineAgBgCrMem : public BaseGemmPipelineAgBgCrMem block_gemm(c_block_tile, a_lds_gemm_window, b_lds_gemm_window); // no second block_sync_lds because it's interwave - Base::LocalPrefill(a_copy_lds_window, - a_block_tiles.get(number{}), - a_element_func); - Base::LocalPrefill(b_copy_lds_window, - b_block_tiles.get(number{}), - b_element_func); + if constexpr(is_a_col_major) + { + auto a_shuffle_tmp = make_static_distributed_tensor( + Policy::template MakeShuffledARegTileDistribution()); + transpose_tile2d(a_shuffle_tmp, a_block_tiles.get(number{})); + Base::LocalPrefill(a_copy_lds_window, a_shuffle_tmp, a_element_func); + } + else + { + Base::LocalPrefill(a_copy_lds_window, + a_block_tiles.get(number{}), + a_element_func); + } + if constexpr(is_b_row_major) + { + auto b_shuffle_tmp = make_static_distributed_tensor( + Policy::template MakeShuffledBRegTileDistribution()); + transpose_tile2d(b_shuffle_tmp, b_block_tiles.get(number{})); + Base::LocalPrefill(b_copy_lds_window, b_shuffle_tmp, b_element_func); + } + else + { + Base::LocalPrefill(b_copy_lds_window, + b_block_tiles.get(number{}), + b_element_func); + } }); block_sync_lds(); diff --git a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp index 33f105a435..feed32a439 100644 --- a/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp +++ b/include/ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp @@ -519,7 +519,7 @@ struct UniversalGemmPipelineAgBgCrPolicy using ALayout = remove_cvref_t; static_assert(std::is_same_v); constexpr index_t BlockSize = Problem::kBlockSize; - constexpr index_t MPerBlock = Problem::BlockGemmShape::kN; + constexpr index_t MPerBlock = Problem::BlockGemmShape::kM; constexpr index_t KPerBlock = Problem::BlockGemmShape::kK; constexpr index_t VecLoadSize = GetVectorSizeA(); diff --git a/test/ck_tile/gemm/test_gemm_pipeline.cpp b/test/ck_tile/gemm/test_gemm_pipeline.cpp index faffe848d5..5193f2db20 100644 --- a/test/ck_tile/gemm/test_gemm_pipeline.cpp +++ b/test/ck_tile/gemm/test_gemm_pipeline.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. #include @@ -14,28 +14,26 @@ using Row = ck_tile::tensor_layout::gemm::RowMajor; using Col = ck_tile::tensor_layout::gemm::ColumnMajor; using Intrawave = ck_tile::integral_constant; -// using Interwave = ck_tile::integral_constant; -// using Mem = ck_tile::integral_constant; -using Comp = ck_tile::integral_constant; - -// TODO: Enable Memory pipeline, when it would be updated for vector loads on non-K major tensors. +using Interwave = ck_tile::integral_constant; +using Mem = ck_tile::integral_constant; +using Comp = ck_tile::integral_constant; // clang-format off using KernelTypes = ::testing::Types< // ALayout, BLayout, CLayout, ADataType, BDataType, AccDataType, CDataType, GemmPipelineScheduler, PipelineType - // std::tuple< Row, Row, Row, F16, F16, F32, F16, Intrawave, Mem>, + std::tuple< Row, Row, Row, F16, F16, F32, F16, Intrawave, Mem>, std::tuple< Row, Row, Row, F16, F16, F32, F16, Intrawave, Comp>, - // std::tuple< Row, Row, Row, F16, F16, F32, F16, Interwave, Mem>, - // std::tuple< Row, Col, Row, F16, F16, F32, F16, Intrawave, Mem>, + std::tuple< Row, Row, Row, F16, F16, F32, F16, Interwave, Mem>, + std::tuple< Row, Col, Row, F16, F16, F32, F16, Intrawave, Mem>, std::tuple< Row, Col, Row, F16, F16, F32, F16, Intrawave, Comp>, - // std::tuple< Row, Col, Row, F16, F16, F32, F16, Interwave, Mem>, - // std::tuple< Col, Row, Row, F16, F16, F32, F16, Intrawave, Mem>, + std::tuple< Row, Col, Row, F16, F16, F32, F16, Interwave, Mem>, + std::tuple< Col, Row, Row, F16, F16, F32, F16, Intrawave, Mem>, std::tuple< Col, Row, Row, F16, F16, F32, F16, Intrawave, Comp>, - // std::tuple< Col, Row, Row, F16, F16, F32, F16, Interwave, Mem>, - // std::tuple< Col, Col, Row, F16, F16, F32, F16, Intrawave, Mem>, - std::tuple< Col, Col, Row, F16, F16, F32, F16, Intrawave, Comp> - // std::tuple< Col, Col, Row, F16, F16, F32, F16, Interwave, Mem> + std::tuple< Col, Row, Row, F16, F16, F32, F16, Interwave, Mem>, + std::tuple< Col, Col, Row, F16, F16, F32, F16, Intrawave, Mem>, + std::tuple< Col, Col, Row, F16, F16, F32, F16, Intrawave, Comp>, + std::tuple< Col, Col, Row, F16, F16, F32, F16, Interwave, Mem> >; // clang-format on