Fix a bug in occupancy estimation.

This commit is contained in:
Ville Pietilä
2025-06-12 09:41:17 +00:00
parent 1e1917c5bc
commit 8e965d07ab
3 changed files with 37 additions and 22 deletions

View File

@@ -427,10 +427,12 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle
{
MaximumActiveBlocksPerMultiprocessor()
{
constexpr size_t dynSharedMemPerBlk = GridwiseGemm::GetSharedMemoryNumberOfByte();
constexpr size_t dynSharedMemPerBlk = 0;
constexpr size_t ldsMemPerBlk = GridwiseGemm::GetSharedMemoryNumberOfByte();
if (ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
std::cout << "[SPLIT-K AUTODEDUCE] Dynamic shared memory per block: " << dynSharedMemPerBlk << " bytes" << std::endl;
std::cout << "[SPLIT-K AUTODEDUCE] LDS memory per block: " << ldsMemPerBlk << " bytes" << std::endl;
}
int max_occupancy = 0;
hip_check_error(hipOccupancyMaxActiveBlocksPerMultiprocessor(

View File

@@ -30,41 +30,53 @@ struct DeviceProperties
inline ck::index_t get_k_batch_value(int max_occupancy, ck::index_t grid_size, ck::index_t K_size, ck::index_t conv_G /*, ck::index_t multiplier*/)
{
static DeviceProperties device_properties;
constexpr ck::index_t k_batch_min = 1;
constexpr ck::index_t batch_size_min = 512;
//constexpr ck::index_t default_batch_size = 512;
//constexpr ck::index_t min_batch_size = 8192;
const int num_cu = device_properties.num_cu_;
const auto k_batch_max = math::integer_divide_ceil(K_size, batch_size_min);
// Ensure that we do not exceed the maximum capacity; exceeding it would lead to wave quantization.
const auto optimal_split = static_cast<ck::index_t>(std::floor((max_occupancy * num_cu) / (1.0 * grid_size * conv_G)));
// auto target_batch_size = static_cast<ck::index_t>(ck::EnvValue(CK_ENV(CK_SPLIT_K_BATCH_SIZE)));
// if (target_batch_size < min_batch_size)
// {
// target_batch_size = default_batch_size;
// }
// The optimal split is an integer multiple of (max_occupancy * num_cu) / (1.0 * grid_size * conv_G).
// Here we take the integer to be conv_G, i.e., the number of groups.
// The number is floored to ensure that we do not exceed the maximum capacity of compute units, i.e.,
// we prefer (N-eps) * max_capacity to (N+eps) * max_capacity because the latter leads to
// using only an eps fraction of the capacity on the last wave.
// const auto optimal_split = static_cast<ck::index_t>(std::floor((max_occupancy * num_cu) / (1.0 * grid_size)));
// auto k_batch = 1;
// if (optimal_split > 0 && K_size > target_batch_size)
// {
// //The optimal value of k_batch is a multiple of the optimal_split.
// //We need to find the optimal number of K values per batch - this gives the optimal k_batch value.
// k_batch = optimal_split;
// const auto current_batch_size = math::integer_divide_ceil(K_size, k_batch);
// if (current_batch_size > target_batch_size)
// {
// // If the current batch size is larger than the target batch size, we need to increase k_batch.
// const ck::index_t multiplier = std::max(1, math::integer_divide_ceil(K_size, target_batch_size * optimal_split));
// k_batch = optimal_split * multiplier;
// }
// }
auto k_batch = 1;
constexpr ck::index_t num_waves = 1;
const auto optimal_split = static_cast<ck::index_t>(std::floor((max_occupancy * num_cu) / (num_waves * grid_size * conv_G)));
if (optimal_split > 1)
{
//The optimal value of k_batch is a multiple of the optimal_split.
//We need to find the optimal number of K values per batch - this gives the optimal k_batch value.
auto target_batch_size = static_cast<ck::index_t>(ck::EnvValue(CK_ENV(CK_SPLIT_K_BATCH_SIZE)));
if (target_batch_size < k_batch_min)
{
target_batch_size = k_batch_min;
}
k_batch = optimal_split;
const auto current_batch_size = math::integer_divide_ceil(K_size, k_batch);
if (current_batch_size > target_batch_size)
{
// If the current batch size is larger than the target batch size, we need to increase k_batch.
const ck::index_t multiplier = std::max(1, math::integer_divide_ceil(K_size, target_batch_size * optimal_split));
k_batch = optimal_split * multiplier;
}
}
if (ck::EnvIsEnabled(CK_ENV(CK_LOGGING)))
{
std::cout << "[SPLIT-K AUTODEDUCE] Max active thread blocks per CU for GEMM kernel: " << max_occupancy << std::endl;
std::cout << "[SPLIT-K AUTODEDUCE] Output grid size (M tiles x N tiles x Conv groups): " << grid_size << std::endl;
std::cout << "[SPLIT-K AUTODEDUCE] Output grid size (M tiles x N tiles): " << grid_size << std::endl;
std::cout << "[SPLIT-K AUTODEDUCE] K-dim size: " << K_size << std::endl;
std::cout << "[SPLIT-K AUTODEDUCE] Conv groups: " << conv_G << std::endl;
std::cout << "[SPLIT-K AUTODEDUCE] Optimal split value: " << optimal_split << std::endl;
std::cout << "[SPLIT-K AUTODEDUCE] Maximum k_batch value: " << k_batch_max << std::endl;
//std::cout << "[SPLIT-K AUTODEDUCE] Target batch size: " << target_batch_size << std::endl;
std::cout << "[SPLIT-K AUTODEDUCE] Optimal split-k value " << k_batch << " for K-batch."<< std::endl;
}
return k_batch;

View File

@@ -249,6 +249,7 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
float max_accumulated_value = 0;
if(do_verification)
{
std::cout << "Running reference implementation for verification..." << std::endl;
auto ref_conv = ck::tensor_operation::host::ReferenceConvBwdWeight<NDimSpatial,
InDataType,
WeiDataType,