From fcf50b211df28fb89f8327706008d0b3977451fc Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Tue, 17 Jun 2025 23:41:56 +0000 Subject: [PATCH] changes to profiler for streamk --- .../profile_gemm_universal_streamk_impl.hpp | 222 +++++++----------- .../src/profile_gemm_universal_streamk.cpp | 28 +-- 2 files changed, 103 insertions(+), 147 deletions(-) mode change 100755 => 100644 profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp mode change 100755 => 100644 profiler/src/profile_gemm_universal_streamk.cpp diff --git a/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp b/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp old mode 100755 new mode 100644 index e625fae808..a2a2c60461 --- a/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp +++ b/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp @@ -44,11 +44,10 @@ bool profile_gemm_universal_streamk_impl(int do_verification, int StrideA, int StrideB, int StrideC, - int Streamk_sel, - int Grid_size, int n_warmup, int n_iter, - uint64_t rotating = 0) + uint64_t rotating = 0, + uint32_t NumSKBlocks = 0xffffffff) { bool pass = true; @@ -152,144 +151,105 @@ bool profile_gemm_universal_streamk_impl(int do_verification, } std::string best_op_name; - float best_ave_time = 0; - float best_tflops = 0; - float best_gb_per_sec = 0; - float best_grid_size = 0; - float best_streamk_sel = 0; + float best_ave_time = 0; + float best_tflops = 0; + float best_gb_per_sec = 0; // profile device GEMM instances for(auto& op_ptr : op_ptrs) { - std::vector grid_size_list = {38, 76, 114, 152, 190, 228, 266, 304, 342, 380}; - std::vector streamk_sel_list = { - 0, 1, 2, 3, 4}; // 0: Data Parallel (DP) mode (Stream-K OFF), 1: 1-tile Stream-K+ DP, - // 2:2-tile Stream-K + DP + auto argument_ptr = + op_ptr->MakeArgumentPointer(static_cast(a_device_buf.GetDeviceBuffer()), + static_cast(b_device_buf.GetDeviceBuffer()), + static_cast(c_device_buf.GetDeviceBuffer()), + M, + N, + 
K, + StrideA, + StrideB, + StrideC, + a_element_op, + b_element_op, + c_element_op, + NumSKBlocks); // NumSKBlocks parameter - if(Grid_size == -1) + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) { - grid_size_list = {Grid_size}; - } - if(Streamk_sel != -1) - { - streamk_sel_list = {Streamk_sel}; - } - for(std::size_t j = 0; j < streamk_sel_list.size(); j++) - { - for(std::size_t i = 0; i < grid_size_list.size(); i++) + + // re-init C to zero before profiling next kernel + c_device_buf.SetZero(); + + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false, 0, n_warmup, n_iter}); + + if(do_verification) { - auto grid_size_curr = grid_size_list[i]; - index_t streamk_sel_curr = streamk_sel_list[j]; - printf("streamk_sel_curr=%0d\n", streamk_sel_curr); - auto argument_ptr = op_ptr->MakeArgumentPointer( - static_cast(a_device_buf.GetDeviceBuffer()), - static_cast(b_device_buf.GetDeviceBuffer()), - static_cast(c_device_buf.GetDeviceBuffer()), - M, - N, - K, - StrideA, - StrideB, - StrideC, - streamk_sel_curr, - grid_size_curr, - a_element_op, - b_element_op, - c_element_op); + c_device_buf.FromDevice(c_m_n_device_result.mData.data()); - auto invoker_ptr = op_ptr->MakeInvokerPointer(); + // Always compare against CPU reference results computed earlier + pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result); - if(op_ptr->IsSupportedArgument(argument_ptr.get())) + if(do_log) { - - // re-init C to zero before profiling next kernel - c_device_buf.SetZero(); - - invoker_ptr->Run(argument_ptr.get(), - StreamConfig{nullptr, false, 0, n_warmup, n_iter}); - - if(do_verification) - { - c_device_buf.FromDevice(c_m_n_device_result.mData.data()); - - // Always compare against CPU reference results computed earlier - pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result); - - if(do_log) - { - LogRangeAsType(std::cout << "a : ", a_m_k.mData, ",") - << std::endl; - 
LogRangeAsType(std::cout << "b: ", b_k_n.mData, ",") - << std::endl; - LogRangeAsType( - std::cout << "c_host : ", c_m_n_host_result.mData, ",") - << std::endl; - LogRangeAsType( - std::cout << "c_device: ", c_m_n_device_result.mData, ",") - << std::endl; - } - } - - std::string op_name = op_ptr->GetTypeString(); - - float ave_time = invoker_ptr->Run(argument_ptr.get(), - StreamConfig{nullptr, - time_kernel, - 0, - n_warmup, - n_iter, - rotating_count > 1, - rotating_count}); - - std::size_t flop = std::size_t(2) * M * N * K; - - std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + - sizeof(CDataType) * M * N; - - float tflops = static_cast(flop) / 1.E9 / ave_time; - - float gb_per_sec = num_btype / 1.E6 / ave_time; - - std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops - << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", Grid_size " - << grid_size_curr << ", streamk selection strategy" - << streamk_sel_curr << std::endl; - -#if defined CK_ENABLE_FP8 - // set softer tolerances for fp8 - if constexpr(is_same_v || is_same_v || - is_same_v) - { - std::string msg = "Error: Incorrect results!"; - double rtol = 1e-1; - double atol = 1e-1; - pass = pass & ck::utils::check_err( - c_m_n_device_result, c_m_n_host_result, msg, rtol, atol); - } - else - { -#endif - pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result); -#if defined CK_ENABLE_FP8 - } -#endif - - if(tflops > best_tflops) - { - best_op_name = op_name; - best_tflops = tflops; - best_ave_time = ave_time; - best_gb_per_sec = gb_per_sec; - best_grid_size = grid_size_curr; - best_streamk_sel = streamk_sel_curr; - } - } - else - { - std::cout << op_ptr->GetTypeString() << " does not support this problem" - << std::endl; + LogRangeAsType(std::cout << "a : ", a_m_k.mData, ",") << std::endl; + LogRangeAsType(std::cout << "b: ", b_k_n.mData, ",") << std::endl; + LogRangeAsType(std::cout << "c_host : ", c_m_n_host_result.mData, ",") + << std::endl; 
+ LogRangeAsType(std::cout << "c_device: ", c_m_n_device_result.mData, ",") + << std::endl; } } + + std::string op_name = op_ptr->GetTypeString(); + + float ave_time = invoker_ptr->Run( + argument_ptr.get(), + StreamConfig{ + nullptr, time_kernel, 0, n_warmup, n_iter, rotating_count > 1, rotating_count}); + + std::size_t flop = std::size_t(2) * M * N * K; + + std::size_t num_btype = + sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N; + + float tflops = static_cast(flop) / 1.E9 / ave_time; + + float gb_per_sec = num_btype / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + +#if defined CK_ENABLE_FP8 + // set softer tolerances for fp8 + if constexpr(is_same_v || is_same_v || + is_same_v) + { + std::string msg = "Error: Incorrect results!"; + double rtol = 1e-1; + double atol = 1e-1; + pass = pass & ck::utils::check_err( + c_m_n_device_result, c_m_n_host_result, msg, rtol, atol); + } + else + { +#endif + pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result); +#if defined CK_ENABLE_FP8 + } +#endif + + if(tflops > best_tflops) + { + best_op_name = op_name; + best_tflops = tflops; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl; } } @@ -329,9 +289,7 @@ bool profile_gemm_universal_streamk_impl(int do_verification, } std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA - << " StrideB = " << StrideB << " StrideC = " << StrideC - << " Grid_size = " << best_grid_size - << " Stream-K selection strategy = " << best_streamk_sel << " : " << best_ave_time + << " StrideB = " << StrideB << " StrideC = " << StrideC << " Time : " << best_ave_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; diff --git 
a/profiler/src/profile_gemm_universal_streamk.cpp b/profiler/src/profile_gemm_universal_streamk.cpp old mode 100755 new mode 100644 index 4d1ab811ee..fb5ae086f4 --- a/profiler/src/profile_gemm_universal_streamk.cpp +++ b/profiler/src/profile_gemm_universal_streamk.cpp @@ -34,7 +34,7 @@ enum struct GemmDataType int profile_gemm_universal_streamk(int argc, char* argv[]) { - if(argc != 16 && argc != 19) + if(argc != 14 && argc != 18) { printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"); printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: f16, " @@ -48,12 +48,11 @@ int profile_gemm_universal_streamk(int argc, char* argv[]) printf("arg6: print tensor value (0: no; 1: yes)\n"); printf("arg7: time kernel (0=no, 1=yes)\n"); printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n"); - printf("arg14: Stream-k select strategy 0: all DP, 1: 1-tile SK, 2: 2-tile SK\n"); - printf("arg15: Grid-size, -1 for max persistent kernel occupancy\n"); printf("optional:\n"); - printf("arg16: number of warm-up cycles (default 1)\n"); - printf("arg17: number of iterations (default 10)\n"); - printf("arg18: memory for rotating buffer (default 0, size in MB)\n"); + printf("arg14: number of warm-up cycles (default 1)\n"); + printf("arg15: number of iterations (default 10)\n"); + printf("arg16: memory for rotating buffer (default 0, size in MB)\n"); + printf("arg17: NumSKBlocks\n"); exit(1); } @@ -86,18 +85,18 @@ int profile_gemm_universal_streamk(int argc, char* argv[]) const int K = std::stoi(argv[10]); - const int StrideC = std::stoi(argv[13]); - const int Streamk_sel = std::stoi(argv[14]); - const int Grid_size = std::stoi(argv[15]); + const int StrideC = std::stoi(argv[13]); + const uint32_t NumSKBlocks = + argc >= 18 ? 
static_cast<uint32_t>(std::stoul(std::string(argv[17]))) : 0xffffffff; int n_warmup = 20; int n_iter = 50; uint64_t rotating = 0; - if(argc == 19) + if(argc == 18) { - n_warmup = std::stoi(argv[16]); - n_iter = std::stoi(argv[17]); - rotating = std::stoull(argv[18]) * 1024 * 1024; + n_warmup = std::stoi(argv[14]); + n_iter = std::stoi(argv[15]); + rotating = std::stoull(argv[16]) * 1024 * 1024; } using F32 = float; @@ -151,8 +150,7 @@ int profile_gemm_universal_streamk(int argc, char* argv[]) (StrideA < 0) ? DefaultStrideA : StrideA, (StrideB < 0) ? DefaultStrideB : StrideB, (StrideC < 0) ? DefaultStrideC : StrideC, - Streamk_sel, - Grid_size, n_warmup, n_iter, - rotating); + rotating, + NumSKBlocks); // order must match the impl signature: ..., n_warmup, n_iter, rotating, NumSKBlocks