From 8e965d07ab3c8dd7f140bbd7be69fc08716df896 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ville=20Pietil=C3=A4?= <>
Date: Thu, 12 Jun 2025 09:41:17 +0000
Subject: [PATCH] Fix a bug in occupancy estimation.

---
 ...e_grouped_conv_bwd_weight_xdl_cshuffle.hpp |  4 +-
 .../gpu/device/impl/split_k_utils.hpp         | 54 +++++++++++--------
 .../profile_grouped_conv_bwd_weight_impl.hpp  |  1 +
 3 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
index 0b8cc1b0e9..715c4cfce3 100644
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
@@ -427,10 +427,12 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
     {
         MaximumActiveBlocksPerMultiprocessor()
         {
-            constexpr size_t dynSharedMemPerBlk = GridwiseGemm::GetSharedMemoryNumberOfByte();
+            constexpr size_t dynSharedMemPerBlk = 0;
+            constexpr size_t ldsMemPerBlk = GridwiseGemm::GetSharedMemoryNumberOfByte();
             if (ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
             {
                 std::cout << "[SPLIT-K AUTODEDUCE] Dynamic shared memory per block: " << dynSharedMemPerBlk << " bytes" << std::endl;
+                std::cout << "[SPLIT-K AUTODEDUCE] LDS memory per block: " << ldsMemPerBlk << " bytes" << std::endl;
             }
             int max_occupancy = 0;
             hip_check_error(hipOccupancyMaxActiveBlocksPerMultiprocessor(
diff --git a/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp b/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp
index 8fe22afd50..06b370daa1 100644
--- a/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp
@@ -30,41 +30,53 @@ struct DeviceProperties
 inline ck::index_t get_k_batch_value(int max_occupancy, ck::index_t grid_size, ck::index_t K_size, ck::index_t conv_G /*, ck::index_t multiplier*/)
 {
     static DeviceProperties device_properties;
-    constexpr ck::index_t k_batch_min = 1;
-    constexpr ck::index_t batch_size_min = 512;
+    //constexpr ck::index_t default_batch_size = 512;
+    //constexpr ck::index_t min_batch_size = 8192;
 
     const int num_cu = device_properties.num_cu_;
-    const auto k_batch_max = math::integer_divide_ceil(K_size, batch_size_min);
-    // Ensure that we do not exceed the maximum capacity. This would lead to wave quantization.
-    const auto optimal_split = static_cast<ck::index_t>(std::floor((max_occupancy * num_cu) / (1.0 * grid_size * conv_G)));
+    // auto target_batch_size = static_cast<ck::index_t>(ck::EnvValue(CK_ENV(CK_SPLIT_K_BATCH_SIZE)));
+    // if (target_batch_size < min_batch_size)
+    // {
+    //   target_batch_size = default_batch_size;
+    // }
+
+    // The optimal split is an integer multiple of (max_occupancy * num_cu) / (1.0 * grid_size * conv_G).
+    // Here we take the integer to be conv_G, i.e., the number of groups.
+    // The number is floored to ensure that we do not exceed the maximum capacity of compute units, i.e.,
+    // we prefer (N-eps) * max_capacity over (N+eps) * max_capacity because the latter leads to
+    // using only an eps fraction of the capacity on the last wave.
+    // const auto optimal_split = static_cast<ck::index_t>(std::floor((max_occupancy * num_cu) / (1.0 * grid_size)));
+    // auto k_batch = 1;
+    // if (optimal_split > 0 && K_size > target_batch_size)
+    // {
+    //   //The optimal value of k_batch is a multiple of the optimal_split.
+    //   //We need to find the optimal number of K values per batch - this gives the optimal k_batch value.
+    //   k_batch = optimal_split;
+    //   const auto current_batch_size = math::integer_divide_ceil(K_size, k_batch);
+    //   if (current_batch_size > target_batch_size)
+    //   {
+    //     // If the current batch size is larger than the target batch size, we need to increase k_batch.
+    //     const ck::index_t multiplier = std::max(1, math::integer_divide_ceil(K_size, target_batch_size * optimal_split));
+    //     k_batch = optimal_split * multiplier;
+    //   }
+    // }
+
     auto k_batch = 1;
+    constexpr ck::index_t num_waves = 1;
+    const auto optimal_split = static_cast<ck::index_t>(std::floor((max_occupancy * num_cu) / (num_waves * grid_size * conv_G)));
     if (optimal_split > 1)
     {
-      //The optimal value of k_batch is a multiple of the optimal_split.
-      //We need to find the optimal number K values per batch - this gives the optimal k_batch value.
-      auto target_batch_size = static_cast<ck::index_t>(ck::EnvValue(CK_ENV(CK_SPLIT_K_BATCH_SIZE)));
-      if (target_batch_size < k_batch_min)
-      {
-        target_batch_size = k_batch_min;
-      }
       k_batch = optimal_split;
-      const auto current_batch_size = math::integer_divide_ceil(K_size, k_batch);
-      if (current_batch_size > target_batch_size)
-      {
-        // If the current batch size is larger than the target batch size, we need to increase k_batch.
-        const ck::index_t multiplier = std::max(1, math::integer_divide_ceil(K_size, target_batch_size * optimal_split));
-        k_batch = optimal_split * multiplier;
-      }
     }
     
     if (ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
     {
       std::cout << "[SPLIT-K AUTODEDUCE] Max active thread blocks per CU for GEMM kernel:  " << max_occupancy << std::endl;
-      std::cout << "[SPLIT-K AUTODEDUCE] Output grid size (M tiles x N tiles x Conv groups):  " << grid_size << std::endl;
+      std::cout << "[SPLIT-K AUTODEDUCE] Output grid size (M tiles x N tiles):  " << grid_size << std::endl;
       std::cout << "[SPLIT-K AUTODEDUCE] K-dim size:  " << K_size << std::endl;
       std::cout << "[SPLIT-K AUTODEDUCE] Conv groups:  " << conv_G << std::endl;
       std::cout << "[SPLIT-K AUTODEDUCE] Optimal split value:  " << optimal_split << std::endl;
-      std::cout << "[SPLIT-K AUTODEDUCE] Maximum k_batch value:  " << k_batch_max << std::endl;
+      //std::cout << "[SPLIT-K AUTODEDUCE] Target batch size:  " << target_batch_size << std::endl;
       std::cout << "[SPLIT-K AUTODEDUCE] Optimal split-k value " << k_batch << " for K-batch."<< std::endl;
     }
     return k_batch;
diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
index 4941ae5e3a..de9fa3820a 100644
--- a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
+++ b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp
@@ -249,6 +249,7 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
     float max_accumulated_value = 0;
     if(do_verification)
     {
+        std::cout << "Running reference implementation for verification..." << std::endl;
         auto ref_conv     = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
                                                                            InDataType,
                                                                            WeiDataType,