diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp index 1207e4ec65..2ea08300ba 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp @@ -427,13 +427,7 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle { MaximumActiveBlocksPerMultiprocessor() { - constexpr size_t dynSharedMemPerBlk = 0; - constexpr size_t ldsMemPerBlk = GridwiseGemm::GetSharedMemoryNumberOfByte(); - if (ck::EnvIsEnabled(CK_ENV(CK_LOGGING))) - { - std::cout << "[SPLIT-K AUTODEDUCE] Dynamic shared memory per block: " << dynSharedMemPerBlk << " bytes" << std::endl; - std::cout << "[SPLIT-K AUTODEDUCE] LDS memory per block: " << ldsMemPerBlk << " bytes" << std::endl; - } + constexpr int dynamic_smem_size = 0; int max_occupancy = 0; hip_check_error(hipOccupancyMaxActiveBlocksPerMultiprocessor( &max_occupancy, @@ -450,9 +444,9 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle remove_reference_t, remove_reference_t, ComputePtrOffsetOfStridedBatch<>, - true>, // TODO: Do we need to test both true/false for HasMainKBlockLoop? + false>, // Both true/false give the same occupancy. BlockSize, - dynSharedMemPerBlk)); + dynamic_smem_size)); value_ = std::max(1, max_occupancy); } int value_; @@ -553,14 +547,13 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle input_right_pads, k_batch_initial); - const auto& a_grid_desc_kbatch_k0_m_k1 = descs_initial[I0]; const auto& c_grid_desc_m_n = descs_initial[I2]; const auto& block_2_ctile_map = GridwiseGemm::MakeCBlockClusterAdaptor(c_grid_desc_m_n, M01, N01, k_batch_initial); - const auto grid_size = block_2_ctile_map.CalculateGridSize(c_grid_desc_m_n); - const auto k_size = a_grid_desc_kbatch_k0_m_k1.GetLength(I0) * a_grid_desc_kbatch_k0_m_k1.GetLength(I1); - //const auto multiplier = static_cast(-split_k); - k_batch_ = get_k_batch_value(max_occupancy.value_, grid_size, k_size, Conv_G_/*, multiplier*/); + // Max occupancy is calculated for a batched GEMM kernel where the batch size corresponds to the number of convolution groups. + // Hence, the grid is just size of the tile map. + const auto grid_size = block_2_ctile_map.CalculateGridSize(c_grid_desc_m_n); + k_batch_ = get_k_batch_value(max_occupancy.value_, grid_size); } else { k_batch_ = split_k; diff --git a/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp b/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp index 06b370daa1..f7188d7b93 100644 --- a/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/split_k_utils.hpp @@ -27,43 +27,14 @@ struct DeviceProperties int num_cu_; }; -inline ck::index_t get_k_batch_value(int max_occupancy, ck::index_t grid_size, ck::index_t K_size, ck::index_t conv_G /*, ck::index_t multiplier*/) +inline ck::index_t get_k_batch_value(int max_occupancy, ck::index_t grid_size) { static DeviceProperties device_properties; - //constexpr ck::index_t default_batch_size = 512; - //constexpr ck::index_t min_batch_size = 8192; - const int num_cu = device_properties.num_cu_; - // auto target_batch_size = static_cast(ck::EnvValue(CK_ENV(CK_SPLIT_K_BATCH_SIZE))); - // if (target_batch_size < min_batch_size) - // { - // target_batch_size = default_batch_size; - // } - - // The optimal split is an integer multiple of (max_occupancy * num_cu) / (1.0 * grid_size * conv_G). - // Here we take the integer to be conv_G, i.e., the number of groups. - // The number is floored to ensure that we do not exceed the maximum capacity of compute units, i.e, - // we prefer to (N-eps) * max_capacity rather than (N+eps) * max_capacity because the latter leads to - // using only eps fraction of capacity on the last wave. - // const auto optimal_split = static_cast(std::floor((max_occupancy * num_cu) / (1.0 * grid_size))); - // auto k_batch = 1; - // if (optimal_split > 0 && K_size > target_batch_size) - // { - // //The optimal value of k_batch is a multiple of the optimal_split. - // //We need to find the optimal number K values per batch - this gives the optimal k_batch value. - // k_batch = optimal_split; - // const auto current_batch_size = math::integer_divide_ceil(K_size, k_batch); - // if (current_batch_size > target_batch_size) - // { - // // If the current batch size is larger than the target batch size, we need to increase k_batch. - // const ck::index_t multiplier = std::max(1, math::integer_divide_ceil(K_size, target_batch_size * optimal_split)); - // k_batch = optimal_split * multiplier; - // } - // } - auto k_batch = 1; + constexpr ck::index_t num_waves = 1; - const auto optimal_split = static_cast(std::floor((max_occupancy * num_cu) / (num_waves * grid_size * conv_G))); + const auto optimal_split = static_cast(std::floor((max_occupancy * num_cu) / (num_waves * grid_size))); if (optimal_split > 1) { k_batch = optimal_split; @@ -73,10 +44,7 @@ inline ck::index_t get_k_batch_value(int max_occupancy, ck::index_t grid_size, c { std::cout << "[SPLIT-K AUTODEDUCE] Max active thread blocks per CU for GEMM kernel: " << max_occupancy << std::endl; std::cout << "[SPLIT-K AUTODEDUCE] Output grid size (M tiles x N tiles): " << grid_size << std::endl; - std::cout << "[SPLIT-K AUTODEDUCE] K-dim size: " << K_size << std::endl; - std::cout << "[SPLIT-K AUTODEDUCE] Conv groups: " << conv_G << std::endl; std::cout << "[SPLIT-K AUTODEDUCE] Optimal split value: " << optimal_split << std::endl; - //std::cout << "[SPLIT-K AUTODEDUCE] Target batch size: " << target_batch_size << std::endl; std::cout << "[SPLIT-K AUTODEDUCE] Optimal split-k value " << k_batch << " for K-batch."<< std::endl; } return k_batch;