From 70d6dcb230997b0a67110ac41447fa5f1807306d Mon Sep 17 00:00:00 2001 From: Muhammed Emin Ozturk Date: Wed, 18 Jun 2025 07:49:22 -0700 Subject: [PATCH] Stream-K CkProfiler Update ( Replace CPU Validation with GPU Validation and Add Dynamic Grid Size Calculation for Stream-K GEMM Profiler) (#2333) * Stream-K Ckprofiler Update * new grid list based on sm number * clang * update for review * Update profile_gemm_universal_streamk.cpp --------- Co-authored-by: root [ROCm/composable_kernel commit: bfb33bc1e92330f3d5710aad22bdadb174d3f282] --- .../profile_gemm_universal_streamk_impl.hpp | 111 +++++++++++++++--- .../src/profile_gemm_universal_streamk.cpp | 4 +- 2 files changed, 97 insertions(+), 18 deletions(-) mode change 100644 => 100755 profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp mode change 100644 => 100755 profiler/src/profile_gemm_universal_streamk.cpp diff --git a/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp b/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp old mode 100644 new mode 100755 index e625fae808..640b192baf --- a/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp +++ b/profiler/include/profiler/profile_gemm_universal_streamk_impl.hpp @@ -6,6 +6,7 @@ #include #include #include +#include #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" @@ -133,22 +134,62 @@ bool profile_gemm_universal_streamk_impl(int do_verification, // Run reference GEMM if(do_verification) { + // Use GPU validation + using ReferenceGemmInstanceGPU = + ck::tensor_operation::device::ReferenceGemm; - // Use CPU validation - // Note: GPU validation is not supported for fp8 !!! - using ReferenceGemmInstanceCPU = ck::tensor_operation::host::ReferenceGemm; - auto ref_gemm_cpu = ReferenceGemmInstanceCPU{}; - auto ref_invoker_cpu = ref_gemm_cpu.MakeInvoker(); - auto ref_argument_cpu = ref_gemm_cpu.MakeArgument( - a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op); - ref_invoker_cpu.Run(ref_argument_cpu); + auto ref_gemm_gpu = ReferenceGemmInstanceGPU{}; + auto ref_invoker_gpu = ref_gemm_gpu.MakeInvoker(); + auto ref_argument_gpu = ref_gemm_gpu.MakeArgument( + static_cast(a_device_buf.GetDeviceBuffer()), + static_cast(b_device_buf.GetDeviceBuffer()), + static_cast(c_m_n_device_ref_buf.GetDeviceBuffer()), + M, + N, + K, + a_element_op, + b_element_op, + c_element_op); + + if(ref_gemm_gpu.IsSupportedArgument(&ref_argument_gpu)) + { + ref_invoker_gpu.Run(ref_argument_gpu, StreamConfig{nullptr, true}); + c_m_n_device_ref_buf.FromDevice(c_m_n_host_result.mData.data()); + } + else + { + std::cerr << "GPU reference GEMM does not support this problem configuration so does " + "CPU validation." + << std::endl; + + // Use CPU validation + + using ReferenceGemmInstanceCPU = + ck::tensor_operation::host::ReferenceGemm; + auto ref_gemm_cpu = ReferenceGemmInstanceCPU{}; + auto ref_invoker_cpu = ref_gemm_cpu.MakeInvoker(); + auto ref_argument_cpu = ref_gemm_cpu.MakeArgument( + a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op); + ref_invoker_cpu.Run(ref_argument_cpu); + } } std::string best_op_name; @@ -158,10 +199,48 @@ bool profile_gemm_universal_streamk_impl(int do_verification, float best_grid_size = 0; float best_streamk_sel = 0; + // Get number of SMs on the current GPU + int device_id; + hipError_t err = hipGetDevice(&device_id); + if(err != hipSuccess) + { + std::cerr << "hipGetDevice failed: " << hipGetErrorString(err) << std::endl; + return false; + } + + hipDeviceProp_t props; + err = hipGetDeviceProperties(&props, device_id); + if(err != hipSuccess) + { + std::cerr << "hipGetDeviceProperties failed: " << hipGetErrorString(err) << std::endl; + return false; + } + int num_sms = props.multiProcessorCount; + + // Generate grid sizes based on SM count with multipliers + std::vector multipliers = {0.2f, 0.4f, 0.6f, 0.8f, 1.0f, 1.2f, 1.4f, 1.6f, 2.0f}; + std::vector grid_size_list; + + for(float mult : multipliers) + { + int grid_size = static_cast(num_sms * mult); + if(grid_size > 0) + { + grid_size_list.push_back(grid_size); + } + } + + std::cout << "Number of SMs: " << num_sms << std::endl; + std::cout << "Grid sizes to test: "; + for(auto gs : grid_size_list) + { + std::cout << gs << " "; + } + std::cout << std::endl; + // profile device GEMM instances for(auto& op_ptr : op_ptrs) { - std::vector grid_size_list = {38, 76, 114, 152, 190, 228, 266, 304, 342, 380}; std::vector streamk_sel_list = { 0, 1, 2, 3, 4}; // 0: Data Parallel (DP) mode (Stream-K OFF), 1: 1-tile Stream-K+ DP, // 2:2-tile Stream-K + DP diff --git a/profiler/src/profile_gemm_universal_streamk.cpp b/profiler/src/profile_gemm_universal_streamk.cpp old mode 100644 new mode 100755 index 4d1ab811ee..40ae0d70f5 --- a/profiler/src/profile_gemm_universal_streamk.cpp +++ b/profiler/src/profile_gemm_universal_streamk.cpp @@ -90,8 +90,8 @@ int profile_gemm_universal_streamk(int argc, char* argv[]) const int Streamk_sel = std::stoi(argv[14]); const int Grid_size = std::stoi(argv[15]); - int n_warmup = 20; - int n_iter = 50; + int n_warmup = 1; + int n_iter = 10; uint64_t rotating = 0; if(argc == 19) {