From 8e965d07ab3c8dd7f140bbd7be69fc08716df896 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Pietil=C3=A4?= <> Date: Thu, 12 Jun 2025 09:41:17 +0000 Subject: [PATCH] Fix a bug in occupancy estimation. --- ...e_grouped_conv_bwd_weight_xdl_cshuffle.hpp | 4 +- .../gpu/device/impl/split_k_utils.hpp | 54 +++++++++++-------- .../profile_grouped_conv_bwd_weight_impl.hpp | 1 + 3 files changed, 37 insertions(+), 22 deletions(-) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp index 0b8cc1b0e9..715c4cfce3 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp @@ -427,10 +427,12 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle { MaximumActiveBlocksPerMultiprocessor() { - constexpr size_t dynSharedMemPerBlk = GridwiseGemm::GetSharedMemoryNumberOfByte(); + constexpr size_t dynSharedMemPerBlk = 0; + constexpr size_t ldsMemPerBlk = GridwiseGemm::GetSharedMemoryNumberOfByte(); if (ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "[SPLIT-K AUTODEDUCE] Dynamic shared memory per block: " << dynSharedMemPerBlk << " bytes" << std::endl; + std::cout << "[SPLIT-K AUTODEDUCE] LDS memory per block: " << ldsMemPerBlk << " bytes" << std::endl; } int max_occupancy = 0; hip_check_error(hipOccupancyMaxActiveBlocksPerMultiprocessor( diff --git a/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp b/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp index 8fe22afd50..06b370daa1 100644 --- a/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp @@ -30,41 +30,53 @@ struct DeviceProperties inline ck::index_t get_k_batch_value(int max_occupancy, ck::index_t grid_size, ck::index_t K_size, ck::index_t conv_G /*, ck::index_t multiplier*/) { static DeviceProperties device_properties; - constexpr ck::index_t k_batch_min = 1; - constexpr ck::index_t batch_size_min = 512; + //constexpr ck::index_t default_batch_size = 512; + //constexpr ck::index_t min_batch_size = 8192; const int num_cu = device_properties.num_cu_; - const auto k_batch_max = math::integer_divide_ceil(K_size, batch_size_min); - // Ensure that we do not exceed the maximum capacity. This would lead to wave quantization. - const auto optimal_split = static_cast(std::floor((max_occupancy * num_cu) / (1.0 * grid_size * conv_G))); + // auto target_batch_size = static_cast(ck::EnvValue(CK_ENV(CK_SPLIT_K_BATCH_SIZE))); + // if (target_batch_size < min_batch_size) + // { + // target_batch_size = default_batch_size; + // } + + // The optimal split is an integer multiple of (max_occupancy * num_cu) / (1.0 * grid_size * conv_G). + // Here we take the integer to be conv_G, i.e., the number of groups. + // The number is floored to ensure that we do not exceed the maximum capacity of compute units, i.e, + // we prefer to (N-eps) * max_capacity rather than (N+eps) * max_capacity because the latter leads to + // using only eps fraction of capacity on the last wave. + // const auto optimal_split = static_cast(std::floor((max_occupancy * num_cu) / (1.0 * grid_size))); + // auto k_batch = 1; + // if (optimal_split > 0 && K_size > target_batch_size) + // { + // //The optimal value of k_batch is a multiple of the optimal_split. + // //We need to find the optimal number K values per batch - this gives the optimal k_batch value. + // k_batch = optimal_split; + // const auto current_batch_size = math::integer_divide_ceil(K_size, k_batch); + // if (current_batch_size > target_batch_size) + // { + // // If the current batch size is larger than the target batch size, we need to increase k_batch. + // const ck::index_t multiplier = std::max(1, math::integer_divide_ceil(K_size, target_batch_size * optimal_split)); + // k_batch = optimal_split * multiplier; + // } + // } + auto k_batch = 1; + constexpr ck::index_t num_waves = 1; + const auto optimal_split = static_cast(std::floor((max_occupancy * num_cu) / (num_waves * grid_size * conv_G))); if (optimal_split > 1) { - //The optimal value of k_batch is a multiple of the optimal_split. - //We need to find the optimal number K values per batch - this gives the optimal k_batch value. - auto target_batch_size = static_cast(ck::EnvValue(CK_ENV(CK_SPLIT_K_BATCH_SIZE))); - if (target_batch_size < k_batch_min) - { - target_batch_size = k_batch_min; - } k_batch = optimal_split; - const auto current_batch_size = math::integer_divide_ceil(K_size, k_batch); - if (current_batch_size > target_batch_size) - { - // If the current batch size is larger than the target batch size, we need to increase k_batch. - const ck::index_t multiplier = std::max(1, math::integer_divide_ceil(K_size, target_batch_size * optimal_split)); - k_batch = optimal_split * multiplier; - } } if (ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) { std::cout << "[SPLIT-K AUTODEDUCE] Max active thread blocks per CU for GEMM kernel: " << max_occupancy << std::endl; - std::cout << "[SPLIT-K AUTODEDUCE] Output grid size (M tiles x N tiles x Conv groups): " << grid_size << std::endl; + std::cout << "[SPLIT-K AUTODEDUCE] Output grid size (M tiles x N tiles): " << grid_size << std::endl; std::cout << "[SPLIT-K AUTODEDUCE] K-dim size: " << K_size << std::endl; std::cout << "[SPLIT-K AUTODEDUCE] Conv groups: " << conv_G << std::endl; std::cout << "[SPLIT-K AUTODEDUCE] Optimal split value: " << optimal_split << std::endl; - std::cout << "[SPLIT-K AUTODEDUCE] Maximum k_batch value: " << k_batch_max << std::endl; + //std::cout << "[SPLIT-K AUTODEDUCE] Target batch size: " << target_batch_size << std::endl; std::cout << "[SPLIT-K AUTODEDUCE] Optimal split-k value " << k_batch << " for K-batch."<< std::endl; } return k_batch; diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp index 4941ae5e3a..de9fa3820a 100644 --- a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp @@ -249,6 +249,7 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, float max_accumulated_value = 0; if(do_verification) { + std::cout << "Running reference implementation for verification..." << std::endl; auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdWeight