Remove some oversubscriptions.

2026-07-03 05:37:34 +00:00 · 2025-07-10 15:42:10 +00:00
parent 66e4ee4962
commit 7bfe606b12
4 changed files with 222 additions and 185 deletions
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
@@ -553,18 +553,16 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
                const auto& c_grid_desc_m_n   = descs_initial[I2];
                const auto& block_2_ctile_map = GridwiseGemm::MakeCBlockClusterAdaptor(c_grid_desc_m_n, M01, N01, k_batch_initial);

-                // Max occupancy is calculated for a batched GEMM kernel where the batch size corresponds to the number of convolution groups, i.e.,
-                // the max occupancy refers to how may simultaneous kernels processing Conv_G_ iGEMMs can simultaneously run on a single CU. 
-                // Hence, the grid is just size of the tile map, i.e., we should not include Conv_G_ to the grid size.
-                const auto grid_size = block_2_ctile_map.CalculateGridSize(c_grid_desc_m_n);
+                const auto grid_size_mn = block_2_ctile_map.CalculateGridSize(c_grid_desc_m_n);
                std::tie(m_dim_size_, n_dim_size_, k_dim_size_) = 
                    get_bwd_weight_gemm_sizes<NDimSpatial>(a_g_n_k_wos_lengths, e_g_k_c_xs_lengths);

                const auto k_grid_size = k_dim_size_ / K0PerBlock;

+                const auto total_grid_size = grid_size_mn * Conv_G_;
                k_batch_ = split_k_parameters.strategy_== SplitKStrategy::BestOccupancy
-                    ? get_best_occupancy_k_batch_value(max_occupancy.value_, grid_size)
-                    : get_optimized_k_batch_value(max_occupancy.value_, grid_size, k_grid_size);
+                    ? get_best_occupancy_k_batch_value(max_occupancy.value_, total_grid_size)
+                    : get_optimized_k_batch_value(max_occupancy.value_, grid_size_mn, k_grid_size);

                data_type_ = typeid(ABDataType).name();
                arithmetic_intensity_ = calculate_arithmetic_intensity(m_dim_size_, n_dim_size_, k_dim_size_, sizeof(ABDataType));
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp
@@ -523,16 +523,17 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3
                const auto gemmM = a_grid_desc_kbatch_k0_m_k1.GetLength(I1);
                const auto gemmN = b_grid_desc_kbatch_k0_n_k1.GetLength(I1);

-                // Max occupancy is calculated for a batched GEMM kernel where the batch size corresponds to the number of convolution groups.
-                // Hence, the grid is just size of the tile map.
-                const auto grid_size = GridwiseGemm::Block2CTileMap::CalculateGridSize(gemmM, gemmN);
+                const auto grid_size_mn = GridwiseGemm::Block2CTileMap::CalculateGridSize(gemmM, gemmN);
                std::tie(m_dim_size_, n_dim_size_, k_dim_size_) = 
                    get_bwd_weight_gemm_sizes<NDimSpatial>(a_g_n_k_wos_lengths, e_g_k_c_xs_lengths);
                const auto k_grid_size = k_dim_size_ / K0PerBlock;

+                // For the V3 pipeline, it is beneficial to oversubscribe, so the total grid size is taken to be only
+                // the grid of the GEMM output tiles.
+                const auto total_grid_size = grid_size_mn;
                k_batch_ = split_k_parameters.strategy_== SplitKStrategy::BestOccupancy
-                    ? get_best_occupancy_k_batch_value(max_occupancy.value_, grid_size)
-                    : get_optimized_k_batch_value(max_occupancy.value_, grid_size, k_grid_size);
+                    ? get_best_occupancy_k_batch_value(max_occupancy.value_, total_grid_size)
+                    : get_optimized_k_batch_value(max_occupancy.value_, grid_size_mn, k_grid_size);

                data_type_ = typeid(ABDataType).name();
                arithmetic_intensity_ = calculate_arithmetic_intensity(m_dim_size_, n_dim_size_, k_dim_size_, sizeof(ABDataType));
--- a/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp
@@ -33,21 +33,39 @@ struct DeviceProperties
 inline ck::index_t get_best_occupancy_k_batch_value(int max_occupancy, ck::index_t grid_size)
 {
    static DeviceProperties device_properties;
-    const int num_cu = device_properties.num_cu_;
-    ck::index_t k_batch = 1;
+    const int max_capacity = max_occupancy * device_properties.num_cu_;

-    const auto optimal_split = static_cast<ck::index_t>(std::floor((1.0 *max_occupancy * num_cu) / (grid_size)));
+    // constexpr ck::index_t k_batch_max = 1024;
+    // const auto total_grid_size = grid_size_mn * num_conv_groups;
+    // ck::index_t k_batch = static_cast<ck::index_t>(max_capacity / std::gcd(max_capacity, total_grid_size));
+    // if (k_batch > k_batch_max || k_batch == 0)
+    // {
+    //   // TODO: This could be improved by using the Euclidean algorithm to find the optimal k_batch.
+    //   auto min_remainder = max_capacity;
+    //   for (ck::index_t k = 1; k <= k_batch_max; ++k)
+    //   {
+    //     const auto remainder = (total_grid_size * k) % max_capacity;
+    //     // For equal remainder values, prefer smaller k values.
+    //     if (remainder < min_remainder)
+    //     {
+    //       min_remainder = remainder;
+    //       k_batch = k;
+    //     }
+    //   }
+    // }
+
+    ck::index_t k_batch = 1;
+    const auto optimal_split = static_cast<ck::index_t>(std::floor((1.0 * max_capacity) / (grid_size)));
    if (optimal_split > 1)
    {
      k_batch = optimal_split;
-    }
+    } 
    
    if (ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
    {
      std::cout << "[SPLIT-K AUTODEDUCE] Max active thread blocks per CU for GEMM kernel:  " << max_occupancy << std::endl;
      std::cout << "[SPLIT-K AUTODEDUCE] Output grid size:  " << grid_size << std::endl;
-      std::cout << "[SPLIT-K AUTODEDUCE] Optimal split value:  " << optimal_split << std::endl;
-      std::cout << "[SPLIT-K AUTODEDUCE] Optimal split-k value " << k_batch << " for K-batch."<< std::endl;
+      std::cout << "[SPLIT-K AUTODEDUCE] Optimal split-k value " << k_batch << std::endl;
    }
    return k_batch;
 }