From fcf50b211df28fb89f8327706008d0b3977451fc Mon Sep 17 00:00:00 2001
From: Astha Rai <astha.rai713@gmail.com>
Date: Tue, 17 Jun 2025 23:41:56 +0000
Subject: [PATCH] profiler: replace Streamk_sel/Grid_size with NumSKBlocks for stream-k GEMM

---
 .../profile_gemm_universal_streamk_impl.hpp   | 222 +++++++-----------
 .../src/profile_gemm_universal_streamk.cpp    |  28 +--
 2 files changed, 103 insertions(+), 147 deletions(-)
 mode change 100755 => 100644 profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp
 mode change 100755 => 100644 profiler/src/profile_gemm_universal_streamk.cpp
diff --git a/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp b/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp
old mode 100755
new mode 100644
index e625fae808..a2a2c60461
--- a/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp
+++ b/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp
@@ -44,11 +44,10 @@ bool profile_gemm_universal_streamk_impl(int do_verification,
                                          int StrideA,
                                          int StrideB,
                                          int StrideC,
-                                         int Streamk_sel,
-                                         int Grid_size,
                                          int n_warmup,
                                          int n_iter,
-                                         uint64_t rotating = 0)
+                                         uint64_t rotating    = 0,
+                                         uint32_t NumSKBlocks = 0xffffffff)
 {
     bool pass = true;
 
@@ -152,144 +151,105 @@ bool profile_gemm_universal_streamk_impl(int do_verification,
     }
 
     std::string best_op_name;
-    float best_ave_time    = 0;
-    float best_tflops      = 0;
-    float best_gb_per_sec  = 0;
-    float best_grid_size   = 0;
-    float best_streamk_sel = 0;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
 
     // profile device GEMM instances
     for(auto& op_ptr : op_ptrs)
     {
-        std::vector<int> grid_size_list   = {38, 76, 114, 152, 190, 228, 266, 304, 342, 380};
-        std::vector<int> streamk_sel_list = {
-            0, 1, 2, 3, 4}; // 0: Data Parallel (DP) mode (Stream-K OFF), 1: 1-tile Stream-K+ DP,
-                            // 2:2-tile Stream-K + DP
+        auto argument_ptr =
+            op_ptr->MakeArgumentPointer(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
+                                        static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
+                                        static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+                                        M,
+                                        N,
+                                        K,
+                                        StrideA,
+                                        StrideB,
+                                        StrideC,
+                                        a_element_op,
+                                        b_element_op,
+                                        c_element_op,
+                                        NumSKBlocks); // defaults to 0xffffffff in the signature
 
-        if(Grid_size == -1)
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
         {
-            grid_size_list = {Grid_size};
-        }
-        if(Streamk_sel != -1)
-        {
-            streamk_sel_list = {Streamk_sel};
-        }
-        for(std::size_t j = 0; j < streamk_sel_list.size(); j++)
-        {
-            for(std::size_t i = 0; i < grid_size_list.size(); i++)
+
+            // re-init C to zero before profiling next kernel
+            c_device_buf.SetZero();
+
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false, 0, n_warmup, n_iter});
+
+            if(do_verification)
             {
-                auto grid_size_curr      = grid_size_list[i];
-                index_t streamk_sel_curr = streamk_sel_list[j];
-                printf("streamk_sel_curr=%0d\n", streamk_sel_curr);
-                auto argument_ptr = op_ptr->MakeArgumentPointer(
-                    static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
-                    static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
-                    static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
-                    M,
-                    N,
-                    K,
-                    StrideA,
-                    StrideB,
-                    StrideC,
-                    streamk_sel_curr,
-                    grid_size_curr,
-                    a_element_op,
-                    b_element_op,
-                    c_element_op);
+                c_device_buf.FromDevice(c_m_n_device_result.mData.data());
 
-                auto invoker_ptr = op_ptr->MakeInvokerPointer();
+                // Always compare against CPU reference results computed earlier
+                pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
 
-                if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+                if(do_log)
                 {
-
-                    // re-init C to zero before profiling next kernel
-                    c_device_buf.SetZero();
-
-                    invoker_ptr->Run(argument_ptr.get(),
-                                     StreamConfig{nullptr, false, 0, n_warmup, n_iter});
-
-                    if(do_verification)
-                    {
-                        c_device_buf.FromDevice(c_m_n_device_result.mData.data());
-
-                        // Always compare against CPU reference results computed earlier
-                        pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
-
-                        if(do_log)
-                        {
-                            LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",")
-                                << std::endl;
-                            LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",")
-                                << std::endl;
-                            LogRangeAsType<float>(
-                                std::cout << "c_host  : ", c_m_n_host_result.mData, ",")
-                                << std::endl;
-                            LogRangeAsType<float>(
-                                std::cout << "c_device: ", c_m_n_device_result.mData, ",")
-                                << std::endl;
-                        }
-                    }
-
-                    std::string op_name = op_ptr->GetTypeString();
-
-                    float ave_time = invoker_ptr->Run(argument_ptr.get(),
-                                                      StreamConfig{nullptr,
-                                                                   time_kernel,
-                                                                   0,
-                                                                   n_warmup,
-                                                                   n_iter,
-                                                                   rotating_count > 1,
-                                                                   rotating_count});
-
-                    std::size_t flop = std::size_t(2) * M * N * K;
-
-                    std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
-                                            sizeof(CDataType) * M * N;
-
-                    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
-
-                    float gb_per_sec = num_btype / 1.E6 / ave_time;
-
-                    std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops
-                              << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", Grid_size "
-                              << grid_size_curr << ", streamk selection strategy"
-                              << streamk_sel_curr << std::endl;
-
-#if defined CK_ENABLE_FP8
-                    // set softer tolerances for fp8
-                    if constexpr(is_same_v<ADataType, f8_t> || is_same_v<BDataType, f8_t> ||
-                                 is_same_v<CDataType, f8_t>)
-                    {
-                        std::string msg = "Error: Incorrect results!";
-                        double rtol     = 1e-1;
-                        double atol     = 1e-1;
-                        pass            = pass & ck::utils::check_err(
-                                          c_m_n_device_result, c_m_n_host_result, msg, rtol, atol);
-                    }
-                    else
-                    {
-#endif
-                        pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
-#if defined CK_ENABLE_FP8
-                    }
-#endif
-
-                    if(tflops > best_tflops)
-                    {
-                        best_op_name     = op_name;
-                        best_tflops      = tflops;
-                        best_ave_time    = ave_time;
-                        best_gb_per_sec  = gb_per_sec;
-                        best_grid_size   = grid_size_curr;
-                        best_streamk_sel = streamk_sel_curr;
-                    }
-                }
-                else
-                {
-                    std::cout << op_ptr->GetTypeString() << " does not support this problem"
-                              << std::endl;
+                    LogRangeAsType<float>(std::cout << "a : ", a_m_k.mData, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "b: ", b_k_n.mData, ",") << std::endl;
+                    LogRangeAsType<float>(std::cout << "c_host  : ", c_m_n_host_result.mData, ",")
+                        << std::endl;
+                    LogRangeAsType<float>(std::cout << "c_device: ", c_m_n_device_result.mData, ",")
+                        << std::endl;
                 }
             }
+
+            std::string op_name = op_ptr->GetTypeString();
+
+            float ave_time = invoker_ptr->Run(
+                argument_ptr.get(),
+                StreamConfig{
+                    nullptr, time_kernel, 0, n_warmup, n_iter, rotating_count > 1, rotating_count});
+
+            std::size_t flop = std::size_t(2) * M * N * K;
+
+            std::size_t num_btype =
+                sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
+
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << tflops << " TFlops, "
+                      << gb_per_sec << " GB/s, " << op_name << std::endl;
+
+#if defined CK_ENABLE_FP8
+            // set softer tolerances for fp8
+            if constexpr(is_same_v<ADataType, f8_t> || is_same_v<BDataType, f8_t> ||
+                         is_same_v<CDataType, f8_t>)
+            {
+                std::string msg = "Error: Incorrect results!";
+                double rtol     = 1e-1;
+                double atol     = 1e-1;
+                pass            = pass & ck::utils::check_err(
+                                  c_m_n_device_result, c_m_n_host_result, msg, rtol, atol);
+            }
+            else
+            {
+#endif
+                pass = pass & ck::utils::check_err(c_m_n_device_result, c_m_n_host_result);
+#if defined CK_ENABLE_FP8
+            }
+#endif
+
+            if(tflops > best_tflops)
+            {
+                best_op_name    = op_name;
+                best_tflops     = tflops;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_ptr->GetTypeString() << " does not support this problem" << std::endl;
         }
     }
 
@@ -329,9 +289,7 @@ bool profile_gemm_universal_streamk_impl(int do_verification,
     }
 
     std::cout << " M = " << M << " N = " << N << " K = " << K << " StrideA = " << StrideA
-              << " StrideB = " << StrideB << " StrideC = " << StrideC
-              << " Grid_size = " << best_grid_size
-              << " Stream-K selection strategy = " << best_streamk_sel << " : " << best_ave_time
+              << " StrideB = " << StrideB << " StrideC = " << StrideC << " Time : " << best_ave_time
               << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, "
               << best_op_name << std::endl;
 
diff --git a/profiler/src/profile_gemm_universal_streamk.cpp b/profiler/src/profile_gemm_universal_streamk.cpp
old mode 100755
new mode 100644
index 4d1ab811ee..fb5ae086f4
--- a/profiler/src/profile_gemm_universal_streamk.cpp
+++ b/profiler/src/profile_gemm_universal_streamk.cpp
@@ -34,7 +34,7 @@ enum struct GemmDataType
 
 int profile_gemm_universal_streamk(int argc, char* argv[])
 {
-    if(argc != 16 && argc != 19)
+    if(argc != 14 && argc != 18)
     {
         printf("arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n");
         printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8; 4: f8@f16; 5: f16@f8; 6: f16, "
@@ -48,12 +48,11 @@ int profile_gemm_universal_streamk(int argc, char* argv[])
         printf("arg6: print tensor value (0: no; 1: yes)\n");
         printf("arg7: time kernel (0=no, 1=yes)\n");
         printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideC\n");
-        printf("arg14: Stream-k select strategy 0: all DP, 1: 1-tile SK, 2: 2-tile SK\n");
-        printf("arg15: Grid-size, -1 for max persistent kernel occupancy\n");
         printf("optional:\n");
-        printf("arg16: number of warm-up cycles (default 1)\n");
-        printf("arg17: number of iterations (default 10)\n");
-        printf("arg18: memory for rotating buffer (default 0, size in MB)\n");
+        printf("arg14: number of warm-up cycles (default 20)\n");
+        printf("arg15: number of iterations (default 50)\n");
+        printf("arg16: memory for rotating buffer (default 0, size in MB)\n");
+        printf("arg17: NumSKBlocks (default 0xffffffff)\n");
         exit(1);
     }
 
@@ -86,18 +85,18 @@ int profile_gemm_universal_streamk(int argc, char* argv[])
 
     const int K = std::stoi(argv[10]);
 
-    const int StrideC     = std::stoi(argv[13]);
-    const int Streamk_sel = std::stoi(argv[14]);
-    const int Grid_size   = std::stoi(argv[15]);
+    const int StrideC = std::stoi(argv[13]);
+    const uint32_t NumSKBlocks =
+        argc >= 18 ? static_cast<uint32_t>(std::stoul(std::string(argv[17]))) : 0xffffffff;
 
     int n_warmup      = 20;
     int n_iter        = 50;
     uint64_t rotating = 0;
-    if(argc == 19)
+    if(argc == 18) // optional args come as a set: argv[14..16] here, argv[17] is NumSKBlocks
     {
-        n_warmup = std::stoi(argv[16]);
-        n_iter   = std::stoi(argv[17]);
-        rotating = std::stoull(argv[18]) * 1024 * 1024;
+        n_warmup = std::stoi(argv[14]);
+        n_iter   = std::stoi(argv[15]);
+        rotating = std::stoull(argv[16]) * 1024 * 1024;
     }
 
     using F32  = float;
@@ -151,8 +150,7 @@ int profile_gemm_universal_streamk(int argc, char* argv[])
             (StrideA < 0) ? DefaultStrideA : StrideA,
             (StrideB < 0) ? DefaultStrideB : StrideB,
             (StrideC < 0) ? DefaultStrideC : StrideC,
-            Streamk_sel,
-            Grid_size,
             n_warmup,
             n_iter,
-            rotating);
+            rotating,
+            NumSKBlocks); // impl takes it last, after rotating