From e45991b379dafcce7f006ee970cf0f4f0b3e2cf1 Mon Sep 17 00:00:00 2001 From: Aleksander Dudek Date: Tue, 21 Oct 2025 06:19:38 -0500 Subject: [PATCH] [CK_TILE] Enable vector stores for C Column Layout part1 --- include/ck_tile/core/tensor/buffer_view.hpp | 14 +- .../ops/epilogue/cshuffle_epilogue.hpp | 130 ++++++- .../ops/gemm/kernel/universal_gemm_kernel.hpp | 47 ++- .../gemm/test_gemm_pipeline_kernel_types.hpp | 80 ++-- .../test_gemm_pipeline_universal_run_test.inc | 361 ++++++++++++++++++ .../gemm/test_gemm_pipeline_ut_cases.inc | 6 +- test/ck_tile/gemm/test_gemm_pipeline_util.hpp | 63 +-- 7 files changed, 602 insertions(+), 99 deletions(-) create mode 100644 test/ck_tile/gemm/test_gemm_pipeline_universal_run_test.inc diff --git a/include/ck_tile/core/tensor/buffer_view.hpp b/include/ck_tile/core/tensor/buffer_view.hpp index 3729a0de5c..2a15456014 100644 --- a/include/ck_tile/core/tensor/buffer_view.hpp +++ b/include/ck_tile/core/tensor/buffer_view.hpp @@ -615,9 +615,9 @@ struct buffer_view>::scalar_type; // X contains multiple T - constexpr index_t scalar_per_t_vector = vector_traits>::vector_size; + constexpr index_t scalar_per_t_vector = vector_traits>::vector_size; // 1 - constexpr index_t scalar_per_x_vector = vector_traits>::vector_size; + constexpr index_t scalar_per_x_vector = vector_traits>::vector_size; // 8 static_assert(scalar_per_x_vector % scalar_per_t_vector == 0, "wrong! X should contain multiple T"); @@ -650,7 +650,15 @@ struct buffer_view scalar_per_t_vector, "Condition not met: (( scalar_per_x_vector > scalar_per_t_vector ))"); + //if(threadIdx.x == 0) + //{ + // printf("[DEBUG]: BufferView: t_per_x: %d\n",t_per_x); + // printf("[DEBUG]: BufferView: scalar_per_x_vector: %d\n",scalar_per_x_vector); + // printf("[DEBUG]: BufferView: scalar_per_t_vector: %d\n",scalar_per_t_vector); + // printf("[DEBUG]: BufferView: x.size(): %d\n",x.size()); + //} if constexpr(use_amd_buffer_addressing) { diff --git a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp index 8a84f7e9bf..0192708149 100644 --- a/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp +++ b/include/ck_tile/ops/epilogue/cshuffle_epilogue.hpp @@ -65,6 +65,24 @@ struct CShuffleEpilogueProblem static_assert(NumDTensor == DsLayout::size(), "The size of DsDataType and DsLayout should be the same"); + + CK_TILE_HOST static void PrintInfo() { + printf("[DEBUG]: CShuffleEpilogueProblem: kBlockSize: %d\n",kBlockSize); + printf("[DEBUG]: CShuffleEpilogueProblem: kMPerBlock: %d\n",kMPerBlock); + printf("[DEBUG]: CShuffleEpilogueProblem: kNPerBlock: %d\n",kNPerBlock); + printf("[DEBUG]: CShuffleEpilogueProblem: MWave: %d\n",MWave); + printf("[DEBUG]: CShuffleEpilogueProblem: NWave: %d\n",NWave); + printf("[DEBUG]: CShuffleEpilogueProblem: MPerXdl: %d\n",MPerXdl); + printf("[DEBUG]: CShuffleEpilogueProblem: NPerXdl: %d\n",NPerXdl); + printf("[DEBUG]: CShuffleEpilogueProblem: KPerXdl: %d\n",KPerXdl); + printf("[DEBUG]: CShuffleEpilogueProblem: isCTransposed: %d\n",isCTransposed); + printf("[DEBUG]: CShuffleEpilogueProblem: MemoryOperation: %d\n",static_cast(MemoryOperation)); + printf("[DEBUG]: CShuffleEpilogueProblem: FixedVectorSize: %d\n",static_cast(FixedVectorSize)); + printf("[DEBUG]: CShuffleEpilogueProblem: VectorSizeC: %d\n",VectorSizeC); + printf("[DEBUG]: CShuffleEpilogueProblem: TiledMMAPermuteN: %d\n",static_cast(TiledMMAPermuteN)); + printf("[DEBUG]: CShuffleEpilogueProblem: kNumWaveGroups: %d\n",kNumWaveGroups); + printf("[DEBUG]: CShuffleEpilogueProblem: NumDTensor: %d\n",NumDTensor); + } }; template @@ -122,6 +140,19 @@ struct CShuffleEpilogue CDElementwise elfunc_; CK_TILE_DEVICE CShuffleEpilogue(CDElementwise elfunc = CDElementwise{}) : elfunc_(elfunc) {}; + static constexpr bool IsERowMajor = + std::is_same_v ? true : false; + + CK_TILE_HOST static void PrintInfo() { + printf("[DEBUG]: CShuffleEpilogue: MPerIteration: %d\n",MPerIteration); + printf("[DEBUG]: CShuffleEpilogue: NPerIteration: %d\n",NPerIteration); + printf("[DEBUG]: CShuffleEpilogue: MRepeat: %d\n",MRepeat); + printf("[DEBUG]: CShuffleEpilogue: NRepeat: %d\n",NRepeat); + printf("[DEBUG]: CShuffleEpilogue: GetVectorSizeC: %d\n",GetVectorSizeC()); + printf("[DEBUG]: CShuffleEpilogue: get_warp_size: %d\n",get_warp_size()); + printf("[DEBUG]: CShuffleEpilogue: MPerIterationShuffle: %d\n",MPerIterationShuffle); + printf("[DEBUG]: CShuffleEpilogue: NPerIterationShuffle: %d\n",NPerIterationShuffle); + } static_assert(NumDTensor == DsLayout::size(), "The size of DsDataType and DsLayout should be the same"); @@ -248,6 +279,22 @@ struct CShuffleEpilogue static constexpr index_t MPerIterationShuffle = std::get<0>(MNPerIterationShuffle); static constexpr index_t NPerIterationShuffle = std::get<1>(MNPerIterationShuffle); + static constexpr index_t NumYXdlPerWavePerShuffle = + IsERowMajor ? NumMXdlPerWavePerShuffle : NumNXdlPerWavePerShuffle; + static constexpr index_t NumXXdlPerWavePerShuffle = + IsERowMajor ? NumNXdlPerWavePerShuffle : NumMXdlPerWavePerShuffle; + + static constexpr index_t YPerIterationShuffle = + IsERowMajor ? MPerIterationShuffle : NPerIterationShuffle; + static constexpr index_t XPerIterationShuffle = + IsERowMajor ? NPerIterationShuffle : MPerIterationShuffle; + + static constexpr index_t YPerBlock = IsERowMajor ? kMPerBlock : kNPerBlock; + static constexpr index_t XPerBlock = IsERowMajor ? kNPerBlock : kMPerBlock; + + static constexpr index_t YWave = IsERowMajor ? MWave : NWave; + static constexpr index_t XWave = IsERowMajor ? NWave : MWave; + using WG = WarpGemmDispatcher) { return make_naive_tensor_descriptor( - make_tuple(number{}, number{}), - make_tuple(number<1>{}, number{})); + make_tuple(number{}, number{}), + make_tuple(number{}, number<1>{})); } else { @@ -292,8 +339,8 @@ struct CShuffleEpilogue if constexpr(BlockedXDLN_PerWarp == 1) { return tile_distribution_encoding, - tuple, - sequence>, + tuple, + sequence>, tuple>, tuple>, sequence<1, 2>, @@ -301,12 +348,12 @@ struct CShuffleEpilogue } else { - constexpr int RakedXDLN_PerWarp = NumNXdlPerWavePerShuffle / BlockedXDLN_PerWarp; + constexpr int RakedXDLN_PerWarp = NumXXdlPerWavePerShuffle / BlockedXDLN_PerWarp; // BlockedLayout return tile_distribution_encoding< sequence<>, - tuple, - sequence>, + tuple, + sequence>, tuple>, tuple>, sequence<1, 2, 2>, @@ -622,22 +669,65 @@ struct CShuffleEpilogue auto o_lds_block = make_tensor_view( static_cast(p_smem), lds_block_desc); - auto in_lds_window = make_tile_window( - o_lds_block, - make_tuple(number{}, number{}), - {0, 0}, - LdsTileDistr); + auto in_lds_window = [&o_lds_block, &LdsTileDistr] { + if constexpr(std::is_same_v) + { + return make_tile_window( + o_lds_block, + make_tuple(number{}, number{}), + {0, 0}, + LdsTileDistr); + } + else if constexpr(std::is_same_v) + { + return make_tile_window( + o_lds_block, + make_tuple(number{}, number{}), + {0, 0}, + LdsTileDistr); + } + else + { + static_assert(false, "Unsupported ELayout!"); + } + }(); + //auto in_lds_window = make_tile_window( + // o_lds_block, + // make_tuple(number{}, number{}), + // {0, 0}, + // LdsTileDistr); - auto out_lds_window = make_tile_window( - o_lds_block, - make_tuple(number{}, number{}), - {0, 0}); + //auto out_lds_window = make_tile_window( + // o_lds_block, + // make_tuple(number{}, number{}), + // {0, 0}); + + auto out_lds_window = [&o_lds_block] { + if constexpr(std::is_same_v) + { + return make_tile_window( + o_lds_block, + make_tuple(number{}, number{}), + {0, 0}); + } + else if constexpr(std::is_same_v) + { + return make_tile_window( + o_lds_block, + make_tuple(number{}, number{}), + {0, 0}); + } + else + { + static_assert(false, "Unsupported ELayout!"); + } + }(); constexpr index_t num_access = SFC::get_num_of_access(); - - static_assert(std::is_same_v, - "Currently, the CShuffle Epilogue only supports the Row Major Output layout"); - + // TODO: Add support for Col Major Output Layout - CShuffle Epilogue + //static_assert(std::is_same_v, + // "Currently, the CShuffle Epilogue only supports the Row Major Output layout"); + static_assert(GetVectorSizeC() > 1, "VectorSizeC is not greater than 1!"); using TileEncodingPattern = tile_distribution_encoding_pattern_2d( e_ptr, - make_tuple(kargs.M, kargs.N), - make_tuple(1, kargs.stride_E), - number<1>{}, + make_tuple(kargs.N, kargs.M), + make_tuple(kargs.stride_E, 1), + number{}, number<1>{}); } }(); @@ -831,9 +831,9 @@ struct UniversalGemmKernel else { return pad_tensor_view(e_tensor_view, - make_tuple(number{}, - number{}), - sequence{}); + make_tuple(number{}, + number{}), + sequence{}); } }(); @@ -929,10 +929,25 @@ struct UniversalGemmKernel }, number{}); - auto e_block_window = make_tile_window( - e_pad_view, - make_tuple(number{}, number{}), - {i_m, i_n}); + + + const auto e_block_window = [&] () { + if constexpr(std::is_same_v) + { + return make_tile_window(e_pad_view, + make_tuple(number{}, + number{}), + {i_m, i_n}); + } + else + { + return make_tile_window(e_pad_view, + make_tuple(number{}, + number{}), + {i_n, i_m}); + } + }(); + return make_tuple(as_block_window, bs_block_window, ds_block_window, e_block_window); } @@ -986,7 +1001,19 @@ struct UniversalGemmKernel // Run Epilogue Pipeline auto& c_block_window = gemm_tile_windows.at(I3); + //if(threadIdx.x == 0) + //{ + // printf("CShuffleEpilogue operator() called! Before\n"); + // c_block_window.template print_tile_window_range(0, 4, 0, 8, "A"); + //} + EpiloguePipeline{}(c_block_window, c_block_tile, ds_block_window, smem_ptr_0); + + //if(threadIdx.x == 0) + //{ + // printf("CShuffleEpilogue operator() called! After\n"); + // c_block_window.template print_tile_window_range(0, 4, 0, 8, "A"); + // } } } diff --git a/test/ck_tile/gemm/test_gemm_pipeline_kernel_types.hpp b/test/ck_tile/gemm/test_gemm_pipeline_kernel_types.hpp index 8ae7252908..88f25834b1 100644 --- a/test/ck_tile/gemm/test_gemm_pipeline_kernel_types.hpp +++ b/test/ck_tile/gemm/test_gemm_pipeline_kernel_types.hpp @@ -78,46 +78,46 @@ using KernelTypesMemWmma = ::testing::Types< >; using KernelTypesCompV3 = ::testing::Types< - std::tuple< Row, Row, Row, F16, F16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Row, Row, Row, F16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Row, Row, Row, BF16, BF16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Row, Row, Row, BF16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Row, Row, Row, INT8, INT8, INT32, INT32, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Row, Row, Row, F8, F8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Row, Row, Row, F8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Row, Row, Row, F8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Row, Row, Row, BF8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Row, Row, Row, BF8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Row, Col, Row, F16, F16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Row, Col, Row, F16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Row, Col, Row, BF16, BF16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Row, Col, Row, BF16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Row, Col, Row, INT8, INT8, INT32, INT32, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Row, Col, Row, F8, F8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Row, Col, Row, F8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Row, Col, Row, F8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Row, Col, Row, BF8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Row, Col, Row, BF8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Col, Row, Row, F16, F16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Col, Row, Row, F16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Col, Row, Row, BF16, BF16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Col, Row, Row, BF16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Col, Row, Row, INT8, INT8, INT32, INT32, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Col, Row, Row, F8, F8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Col, Row, Row, F8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Col, Row, Row, F8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Col, Row, Row, BF8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Col, Row, Row, BF8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Col, Col, Row, F16, F16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Col, Col, Row, F16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Col, Col, Row, BF16, BF16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Col, Col, Row, BF16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Col, Col, Row, INT8, INT8, INT32, INT32, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Col, Col, Row, F8, F8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Col, Col, Row, F8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Col, Col, Row, F8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Col, Col, Row, BF8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, - std::tuple< Col, Col, Row, BF8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3> + std::tuple< Row, Row, Col, F16, F16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3> + //std::tuple< Row, Row, Row, F16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Row, Row, Row, BF16, BF16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Row, Row, Row, BF16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Row, Row, Row, INT8, INT8, INT32, INT32, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Row, Row, Row, F8, F8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Row, Row, Row, F8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Row, Row, Row, F8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Row, Row, Row, BF8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Row, Row, Row, BF8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Row, Col, Row, F16, F16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Row, Col, Row, F16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Row, Col, Row, BF16, BF16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Row, Col, Row, BF16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Row, Col, Row, INT8, INT8, INT32, INT32, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Row, Col, Row, F8, F8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Row, Col, Row, F8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Row, Col, Row, F8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Row, Col, Row, BF8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Row, Col, Row, BF8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Col, Row, Row, F16, F16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Col, Row, Row, F16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Col, Row, Row, BF16, BF16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Col, Row, Row, BF16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Col, Row, Row, INT8, INT8, INT32, INT32, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Col, Row, Row, F8, F8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Col, Row, Row, F8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Col, Row, Row, F8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Col, Row, Row, BF8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Col, Row, Row, BF8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Col, Col, Row, F16, F16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Col, Col, Row, F16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Col, Col, Row, BF16, BF16, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Col, Col, Row, BF16, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Col, Col, Row, INT8, INT8, INT32, INT32, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Col, Col, Row, F8, F8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Col, Col, Row, F8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Col, Col, Row, F8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Col, Col, Row, BF8, BF8, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3>, + //std::tuple< Col, Col, Row, BF8, I4, F32, F16, I256, I256, I64, I32, I32, I16, Intrawave, CompV3> >; using KernelTypesCompV3Wmma = ::testing::Types< diff --git a/test/ck_tile/gemm/test_gemm_pipeline_universal_run_test.inc b/test/ck_tile/gemm/test_gemm_pipeline_universal_run_test.inc new file mode 100644 index 0000000000..a5cccfb9a7 --- /dev/null +++ b/test/ck_tile/gemm/test_gemm_pipeline_universal_run_test.inc @@ -0,0 +1,361 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024-2025, Advanced Micro Devices, Inc. All rights reserved. +#pragma once + +template +float gemm(const ck_tile::GemmHostArgs& args, const ck_tile::stream_config& s) + +{ + using GemmShape = ck_tile::TileGemmShape< + ck_tile::sequence, + ck_tile::sequence, + ck_tile:: + sequence, + GemmConfig::PermuteA, + GemmConfig::PermuteB>; + + using TilePartitioner = + ck_tile::GemmSpatiallyLocalTilePartitioner; + + using Traits = ck_tile::TileGemmTraits; + + using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits; + using GemmPipelineProblem = + ck_tile::GemmPipelineProblem; + + using BaseGemmPipeline = typename PipelineTypeTraits< + GemmConfig::Pipeline>::template UniversalGemmPipeline; + + const ck_tile::index_t k_grain = args.k_batch * GemmConfig::K_Tile; + const ck_tile::index_t K_split = (args.K + k_grain - 1) / k_grain * GemmConfig::K_Tile; + const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split); + const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop); + const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop); + + float ave_time{0}; + + const auto Run = [&](const auto has_hot_loop_, + const auto tail_number_, + const auto memory_operation_) { + constexpr bool has_hot_loop_v = has_hot_loop_.value; + constexpr auto tail_number_v = tail_number_.value; + constexpr auto scheduler = GemmConfig::Scheduler; + constexpr auto memory_operation = memory_operation_.value; + + using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem; + + using GemmPipeline = typename PipelineTypeTraits< + GemmConfig::Pipeline>::template GemmPipeline; + using GemmEpilogue = ck_tile::CShuffleEpilogue< + ck_tile::CShuffleEpilogueProblem>; + using Kernel = ck_tile::GemmKernel; + auto kargs = Kernel::MakeKernelArgs(args); + + dim3 grids; + if constexpr(Persistent) + { + grids = Kernel::MaxOccupancyGridSize(s); + } + else + { + grids = Kernel::GridSize(args.M, args.N, args.k_batch); + } + const dim3 blocks = Kernel::BlockSize(); + + if(!Kernel::IsSupportedArgument(kargs)) + { + throw ArgumentsNotSupportedException( + "Wrong! Arguments not supported! Skipping gemm!\n"); + } + + if(s.log_level_ > 0) + { + std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n' + << "shape: " << GemmShape::GetName() << '\n' + << "problem: " << GemmPipelineProblem::GetName() << '\n' + << "pipeline: " << GemmPipeline::GetName() << '\n' + << "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}" + << ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}" + << std::endl; + } + if(s.flush_cache_) + { + std::cout << "Flushing cache..." << std::endl; + static constexpr ck_tile::index_t APackedSize = + std::is_same_v ? 2 : 1; + static constexpr ck_tile::index_t BPackedSize = + std::is_same_v ? 2 : 1; + + ck_tile::HostTensor a_m(ck_tile::host_tensor_descriptor( + args.M, args.K, args.stride_A, is_row_major(ALayout{}))); + ck_tile::HostTensor b_n(ck_tile::host_tensor_descriptor( + args.K, args.N, args.stride_B, is_row_major(BLayout{}))); + + auto size_a_buffer = a_m.get_element_space_size_in_bytes() / APackedSize; + auto size_b_buffer = b_n.get_element_space_size_in_bytes() / BPackedSize; + + ck_tile::RotatingMemWrapper rotating_mem( + kargs.as_ptr[0], kargs.bs_ptr[0], s.rotating_count_, size_a_buffer, size_b_buffer); + rotating_mem.Print(); + + auto run_flush_cache = [&]() { + // flush icache + ck_tile::flush_icache(); + // rotating mem + rotating_mem.Next(); + // clear c mem + if(args.k_batch > 1) + hipGetErrorString(hipMemsetAsync( + args.e_ptr, 0, args.M * args.N * sizeof(CDataType), s.stream_id_)); + }; + ave_time = ck_tile::launch_kernel_time_mask( + s, + run_flush_cache, + ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + } + else + { + ave_time = ck_tile::launch_kernel( + s, + ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + } + return ave_time; + }; + + const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) { + //if(args.k_batch == 1) + //{ + Run(has_hot_loop_, + tail_number_, + ck_tile::integral_constant{}); + //} + //else + //{ + // Run(has_hot_loop_, + // tail_number_, + // ck_tile::integral_constant{}); + //} + }; + + BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num); + return ave_time; +} + +template +bool run_gemm_test_prec_type(std::string a_layout, + std::string b_layout, + ck_tile::ArgParser& arg_parser) +{ + using Row = ck_tile::tensor_layout::gemm::RowMajor; + using Col = ck_tile::tensor_layout::gemm::ColumnMajor; + + if constexpr(std::is_same_v) + { + if(a_layout == "R" && b_layout == "C") + { + return run_gemm_test_with_layouts( + arg_parser, Row{}, Col{}, Row{}); + } + else if(a_layout == "C" && b_layout == "C") + { + return run_gemm_test_with_layouts( + arg_parser, Col{}, Col{}, Row{}); + } + else + { + throw std::runtime_error("Unsupported memory layout for the input matrices when " + "BPrecType is ck_tile::pk_int4_t!"); + } + } + else + { + if(a_layout == "R" && b_layout == "R") + { + return run_gemm_test_with_layouts( + arg_parser, Row{}, Row{}, Row{}); + } + else if(a_layout == "R" && b_layout == "C") + { + return run_gemm_test_with_layouts( + arg_parser, Row{}, Col{}, Row{}); + } + else if(a_layout == "C" && b_layout == "R") + { + return run_gemm_test_with_layouts( + arg_parser, Col{}, Row{}, Row{}); + } + else if(a_layout == "C" && b_layout == "C") + { + return run_gemm_test_with_layouts( + arg_parser, Col{}, Col{}, Row{}); + } + else + { + throw std::runtime_error("Unsupported memory layout for the input matrices!"); + } + } +} + +template +bool run_gemm_test(int argc, char* argv[]) +{ + auto [result, arg_parser] = create_args(argc, argv); + if(!result) + return false; + + std::string a_layout = arg_parser.get_str("a_layout"); + std::string b_layout = arg_parser.get_str("b_layout"); + + return run_gemm_test_prec_type( + a_layout, b_layout, arg_parser); +} + +template +int run_gemm_combinations() +{ + // Define possible values for each parameter + std::vector m_values = {"512", "1024"}; + std::vector n_values = {"512", "2048"}; + std::vector k_values = {"512", "1024"}; + + // We'll store all our arguments as strings first + std::vector arg_strings = {"./bin/tile_example_gemm_universal", + "", // m placeholder + "", // n placeholder + "", // k placeholder + "-stride_a=0", + "-stride_b=0", + "-stride_c=0", + "-v=2", + "-warmup=0", + "-repeat=1"}; + + // Create an array of const char pointers for argv + constexpr size_t ARG_COUNT = 10; + constexpr size_t ARG_MAX_LEN = 64; + char args[ARG_COUNT][ARG_MAX_LEN]; + char* argv[ARG_COUNT]; + + // Run all combinations + bool is_success = true; + for(const auto& m : m_values) + { + arg_strings[1] = "-m=" + m; + + for(const auto& n : n_values) + { + arg_strings[2] = "-n=" + n; + + for(const auto& k : k_values) + { + arg_strings[3] = "-k=" + k; + + // Set up the argv array with pointers to the string data + for(size_t i = 0; i < ARG_COUNT; i++) + { + strncpy(args[i], arg_strings[i].c_str(), ARG_MAX_LEN); + argv[i] = args[i]; + } + + std::cout << "Arguments received: "; + for(size_t i = 1; i < ARG_COUNT; ++i) + { + std::cout << argv[i] << " "; + } + std::cout << std::endl; + + // Call the function with the current configuration + try + { +#if CK_TILE_USE_WMMA + is_success = run_gemm_test, + APrecType, + BPrecType, + CPrecType>(ARG_COUNT, argv) && + is_success; +#else + is_success = run_gemm_test, + APrecType, + BPrecType, + CPrecType>(ARG_COUNT, argv) && + is_success; + is_success = run_gemm_test, + APrecType, + BPrecType, + CPrecType>(ARG_COUNT, argv) && + is_success; +#endif + } + catch(const ArgumentsNotSupportedException& e) + { + std::cerr << "Caught ArgumentsNotSupportedException: " << e.what() << '\n'; + // ArgumentsNotSupportedException is not an error. Do not change is_success + } + catch(const std::runtime_error& e) + { + std::cerr << "Caught runtime error: " << e.what() << '\n'; + is_success = false; + } + } + } + } + return is_success; +} diff --git a/test/ck_tile/gemm/test_gemm_pipeline_ut_cases.inc b/test/ck_tile/gemm/test_gemm_pipeline_ut_cases.inc index 2c648eef23..ca9caa51ad 100644 --- a/test/ck_tile/gemm/test_gemm_pipeline_ut_cases.inc +++ b/test/ck_tile/gemm/test_gemm_pipeline_ut_cases.inc @@ -120,9 +120,9 @@ TYPED_TEST(TEST_SUITE_NAME, PaddK) TYPED_TEST(TEST_SUITE_NAME, Regular) { - std::vector Ms{512}; - constexpr int N = 1024; - constexpr int K = 512; + std::vector Ms{128}; + constexpr int N = 128; + constexpr int K = 128; for(int M : Ms) this->Run(M, N, K); diff --git a/test/ck_tile/gemm/test_gemm_pipeline_util.hpp b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp index f828150e01..e33abe19d1 100644 --- a/test/ck_tile/gemm/test_gemm_pipeline_util.hpp +++ b/test/ck_tile/gemm/test_gemm_pipeline_util.hpp @@ -174,6 +174,7 @@ class TestCkTileGemmPipeline : public ::testing::Test Persistent, NumWaveGroup, preshuffle>; + printf("[DEBUG] VectorSize_: %d\n", GemmUniversalTraits::_VectorSize); using GemmPipelineProblem = ck_tile::GemmPipelineProblem; @@ -224,6 +225,8 @@ class TestCkTileGemmPipeline : public ::testing::Test K_Warp_Tile, UniversalGemmProblem::TransposeC, memory_operation>>; + //GemmEpilogue::Problem::PrintInfo(); + //GemmEpilogue::PrintInfo(); using Kernel = ck_tile::GemmKernel; auto kargs = Kernel::MakeKernelArgs(args); @@ -256,20 +259,20 @@ class TestCkTileGemmPipeline : public ::testing::Test }; const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) { - if(args.k_batch == 1) - { + //if(args.k_batch == 1) + //{ Run(has_hot_loop_, tail_number_, ck_tile::integral_constant{}); - } - else - { - Run(has_hot_loop_, - tail_number_, - ck_tile::integral_constant{}); - } + //} + //else + //{ + // Run(has_hot_loop_, + // tail_number_, + // ck_tile::integral_constant{}); + //} }; BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num); @@ -284,17 +287,17 @@ class TestCkTileGemmPipeline : public ::testing::Test { GTEST_SKIP() << "Unsupported data type combination for gemm pipeline test."; } - if constexpr(PipelineType == GemmPipelineType::CompV4 || - std::is_same_v) - { + //if constexpr(PipelineType == GemmPipelineType::CompV4 || + // std::is_same_v) + //{ // Only do k_batch = 1 when pipeline is CompV4, or BDataType is I4 - k_batches_ = {1}; - } - else - { - // Otherwise, use k_batch = 1 and 2 - k_batches_ = {1, 2}; - } + k_batches_ = {1}; + //} + //else + //{ + // // Otherwise, use k_batch = 1 and 2 + // k_batches_ = {1, 2}; + //} } template @@ -338,8 +341,15 @@ class TestCkTileGemmPipeline : public ::testing::Test ck_tile::HostTensor c_m_n_dev_result( ck_tile::host_tensor_descriptor(M, N, stride_C, is_row_major(CLayout{}))); - ck_tile::FillUniformDistributionIntegerValue{-5, 5, 11939}(a_m_k); - ck_tile::FillUniformDistributionIntegerValue{-5, 5, 11940}(b_k_n); + std::cout << "a_m_k: "; + a_m_k.print_first_n(std::cout) << '\n'; + std::cout << "b_k_n: "; + b_k_n.print_first_n(std::cout) << '\n'; + std::cout << "c_m_n_dev_result: "; + c_m_n_dev_result.print_first_n(std::cout) << '\n'; + + ck_tile::FillUniformDistributionIntegerValue{1, 2, 11939}(a_m_k); + ck_tile::FillUniformDistributionIntegerValue{1, 2, 11940}(b_k_n); ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes()); ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes()); @@ -372,7 +382,7 @@ class TestCkTileGemmPipeline : public ::testing::Test stride_B, stride_C}; - invoke_gemm(args, ck_tile::stream_config{nullptr, false}); + invoke_gemm(args, ck_tile::stream_config{nullptr, false, 2}); c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); bool pass = true; @@ -384,6 +394,13 @@ class TestCkTileGemmPipeline : public ::testing::Test ck_tile::reference_gemm( a_m_k, b_k_n, c_m_n_host_ref); + std::cout << "a_m_k: "; + a_m_k.print_first_n(std::cout) << '\n'; + std::cout << "b_k_n: "; + b_k_n.print_first_n(std::cout) << '\n'; + std::cout << "c_m_n_dev_result: "; + c_m_n_dev_result.print_first_n(std::cout) << '\n'; + const float max_accumulated_value = *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end()); const auto rtol_atol = calculate_rtol_atol(