mirror of
https://github.com/NVIDIA/cutlass.git
synced 2026-05-11 08:50:09 +00:00
Fix SM90 beta=1 hang and stream-K launch errors (#2172)
* Fix stream-K occupancy calculation * Fix beta=1 hang
This commit is contained in:
@@ -661,9 +661,9 @@ public:
|
||||
// Converge before issuing tensormap fence release since fence is aligned
|
||||
__syncwarp();
|
||||
collective_epilogue.template tensormaps_cp_fence_release<IsEpiLoad>(shared_storage.tensormaps.epilogue, epi_load_tensormap, 0);
|
||||
}
|
||||
|
||||
load_order_barrier.wait();
|
||||
load_order_barrier.wait();
|
||||
}
|
||||
|
||||
while (work_tile_info.is_valid()) {
|
||||
int32_t curr_batch = work_tile_info.L_idx;
|
||||
|
||||
@@ -693,10 +693,10 @@ public:
|
||||
// Converge before issuing tensormap fence release since fence is aligned
|
||||
__syncwarp();
|
||||
collective_epilogue.template tensormaps_cp_fence_release<IsEpiLoad>(shared_storage.tensormaps.epilogue, epi_load_tensormap, 0);
|
||||
|
||||
load_order_barrier.wait();
|
||||
}
|
||||
|
||||
load_order_barrier.wait();
|
||||
|
||||
while (work_tile_info.is_valid()) {
|
||||
int32_t curr_batch = work_tile_info.L_idx;
|
||||
|
||||
|
||||
@@ -548,8 +548,13 @@ public:
|
||||
// unflushed global memory prior to this instruction
|
||||
cutlass::arch::wait_on_dependent_grids();
|
||||
|
||||
load_order_barrier.wait();
|
||||
bool do_load_order_wait = true;
|
||||
while (work_tile_info.is_valid()) {
|
||||
if (do_load_order_wait) {
|
||||
load_order_barrier.wait();
|
||||
do_load_order_wait = false;
|
||||
}
|
||||
|
||||
// Compute m_coord, n_coord, l_coord with the post-tiled m-shape and n-shape
|
||||
auto m_coord = idx2crd(work_tile_info.M_idx, shape<2>(gA_mkl));
|
||||
auto n_coord = idx2crd(work_tile_info.N_idx, shape<2>(gB_nkl));
|
||||
|
||||
@@ -188,8 +188,8 @@ struct PersistentTileSchedulerSm90Params {
|
||||
KernelHardwareInfo hw_info,
|
||||
int max_swizzle_size,
|
||||
RasterOrderOptions raster_order_option,
|
||||
bool truncate_by_problem_size=true
|
||||
, bool bypass_occupancy_calculation=false
|
||||
bool truncate_by_problem_size=true,
|
||||
bool bypass_sm90_occupancy_calculation=false
|
||||
) {
|
||||
|
||||
dim3 problem_blocks = get_tiled_cta_shape_mnl(problem_shape, cta_shape, cluster_shape);
|
||||
@@ -199,8 +199,8 @@ struct PersistentTileSchedulerSm90Params {
|
||||
hw_info,
|
||||
max_swizzle_size,
|
||||
raster_order_option,
|
||||
truncate_by_problem_size
|
||||
, bypass_occupancy_calculation
|
||||
truncate_by_problem_size,
|
||||
bypass_sm90_occupancy_calculation
|
||||
);
|
||||
}
|
||||
|
||||
@@ -215,8 +215,8 @@ struct PersistentTileSchedulerSm90Params {
|
||||
KernelHardwareInfo hw_info,
|
||||
int max_swizzle_size,
|
||||
RasterOrderOptions raster_order_option,
|
||||
bool truncate_by_problem_size=true
|
||||
, bool bypass_occupancy_calculation=false
|
||||
bool truncate_by_problem_size=true,
|
||||
bool bypass_sm90_occupancy_calculation=false
|
||||
) {
|
||||
|
||||
int const sm_count = hw_info.sm_count;
|
||||
@@ -267,24 +267,28 @@ struct PersistentTileSchedulerSm90Params {
|
||||
// already calculated using cudaOccupancyMaxActiveClusters
|
||||
else if (max_active_clusters != 0 && max_active_clusters * cluster_size <= sm_count) {
|
||||
if (raster_order == RasterOrder::AlongN) {
|
||||
launch_grid.y = max_active_clusters * cluster_shape.n();
|
||||
launch_grid.y = possibly_truncate(
|
||||
max_active_clusters * cluster_shape.n(),
|
||||
problem_blocks_total / cluster_shape.m());
|
||||
}
|
||||
else {
|
||||
launch_grid.x = max_active_clusters * cluster_shape.m();
|
||||
launch_grid.x = possibly_truncate(
|
||||
max_active_clusters * cluster_shape.m(),
|
||||
problem_blocks_total / cluster_shape.n());
|
||||
}
|
||||
CUTLASS_TRACE_HOST("get_grid_shape(): Proposed GridDims by the scheduler using cudaOccupancyMaxActiveClusters = "
|
||||
"(" << launch_grid.x << ", " << launch_grid.y << ", " << launch_grid.z << ")\n");
|
||||
}
|
||||
else {
|
||||
int cta_per_device = sm_count;
|
||||
if (!bypass_occupancy_calculation) {
|
||||
/*
|
||||
* Optimal grid size calculation is based on
|
||||
* GH100: 8 GPCs, 72 TPCs (9 TPCs/GPC), 2 SMs/TPC, 144 SMs per full GPU
|
||||
* Hence, maximum SMs per GPC = 18
|
||||
*/
|
||||
constexpr int max_sm_per_gpc = 18;
|
||||
cta_per_device = get_max_cta_occupancy(max_sm_per_gpc, cluster_shape, sm_count);
|
||||
if (!bypass_sm90_occupancy_calculation) {
|
||||
/*
|
||||
* Optimal grid size calculation is based on
|
||||
* GH100: 8 GPCs, 72 TPCs (9 TPCs/GPC), 2 SMs/TPC, 144 SMs per full GPU
|
||||
* Hence, maximum SMs per GPC = 18
|
||||
*/
|
||||
constexpr int max_sm_per_gpc = 18;
|
||||
cta_per_device = get_max_cta_occupancy(max_sm_per_gpc, cluster_shape, sm_count);
|
||||
}
|
||||
|
||||
if (raster_order == RasterOrder::AlongN) {
|
||||
@@ -573,8 +577,9 @@ struct PersistentTileSchedulerSm90StreamKParams {
|
||||
ReductionMode reduction_mode,
|
||||
DecompositionMode decomposition_mode,
|
||||
void* workspace,
|
||||
const uint32_t epilogue_subtile = 1u
|
||||
, uint32_t ktile_start_alignment_count = 1u
|
||||
const uint32_t epilogue_subtile = 1u,
|
||||
uint32_t ktile_start_alignment_count = 1u,
|
||||
bool bypass_sm90_occupancy_calculation=false
|
||||
) {
|
||||
dim3 problem_blocks = UnderlyingParams::get_tiled_cta_shape_mnl(
|
||||
problem_shape, tile_shape, cluster_shape);
|
||||
@@ -593,8 +598,9 @@ struct PersistentTileSchedulerSm90StreamKParams {
|
||||
reduction_mode,
|
||||
decomposition_mode,
|
||||
workspace,
|
||||
epilogue_subtile
|
||||
, ktile_start_alignment_count
|
||||
epilogue_subtile,
|
||||
ktile_start_alignment_count,
|
||||
bypass_sm90_occupancy_calculation
|
||||
);
|
||||
}
|
||||
|
||||
@@ -613,8 +619,9 @@ struct PersistentTileSchedulerSm90StreamKParams {
|
||||
ReductionMode reduction_mode,
|
||||
DecompositionMode decomposition_mode,
|
||||
void* workspace,
|
||||
const uint32_t epilogue_subtile = 1
|
||||
, uint32_t ktile_start_alignment_count = 1u
|
||||
const uint32_t epilogue_subtile = 1,
|
||||
uint32_t ktile_start_alignment_count = 1u,
|
||||
bool bypass_sm90_occupancy_calculation=false
|
||||
) {
|
||||
|
||||
#if !defined(__CUDACC_RTC__)
|
||||
@@ -650,7 +657,9 @@ struct PersistentTileSchedulerSm90StreamKParams {
|
||||
raster_order_option,
|
||||
decomposition_mode,
|
||||
reduction_mode,
|
||||
epilogue_subtile
|
||||
epilogue_subtile,
|
||||
ktile_start_alignment_count,
|
||||
bypass_sm90_occupancy_calculation
|
||||
);
|
||||
}
|
||||
|
||||
@@ -741,19 +750,19 @@ struct PersistentTileSchedulerSm90StreamKParams {
|
||||
// Stream-K kernel use below function to set stream-K feature related parameters to choose
|
||||
// optimal/customized decomposition mode.
|
||||
void stream_k_heuristic(
|
||||
UnderlyingParams underlying_params,
|
||||
dim3 problem_blocks,
|
||||
uint32_t k_tiles_per_output_tile,
|
||||
GemmCoord cluster_shape,
|
||||
KernelHardwareInfo hw_info,
|
||||
int splits,
|
||||
int max_swizzle,
|
||||
RasterOrderOptions raster_order_option,
|
||||
DecompositionMode decomposition_mode,
|
||||
ReductionMode reduction_mode,
|
||||
const uint32_t epilogue_subtile = 1
|
||||
, uint32_t ktile_start_alignment_count = 1u
|
||||
) {
|
||||
UnderlyingParams underlying_params,
|
||||
dim3 problem_blocks,
|
||||
uint32_t k_tiles_per_output_tile,
|
||||
GemmCoord cluster_shape,
|
||||
KernelHardwareInfo hw_info,
|
||||
int splits,
|
||||
int max_swizzle,
|
||||
RasterOrderOptions raster_order_option,
|
||||
DecompositionMode decomposition_mode,
|
||||
ReductionMode reduction_mode,
|
||||
const uint32_t epilogue_subtile = 1,
|
||||
uint32_t ktile_start_alignment_count = 1u,
|
||||
bool bypass_sm90_occupancy_calculation=false) {
|
||||
uint32_t groups = 0;
|
||||
uint32_t sk_tiles = 0;
|
||||
uint64_t sk_units = 0;
|
||||
@@ -785,8 +794,9 @@ struct PersistentTileSchedulerSm90StreamKParams {
|
||||
raster_order_option,
|
||||
decomposition_mode,
|
||||
reduction_mode,
|
||||
epilogue_subtile
|
||||
, ktile_start_alignment_count
|
||||
epilogue_subtile,
|
||||
ktile_start_alignment_count,
|
||||
bypass_sm90_occupancy_calculation
|
||||
);
|
||||
|
||||
// Given heuristic_mode returned from the heuristic() method, set params fields.
|
||||
@@ -809,8 +819,8 @@ struct PersistentTileSchedulerSm90StreamKParams {
|
||||
cluster_shape,
|
||||
splits,
|
||||
epilogue_subtile,
|
||||
reduction_mode
|
||||
, ktile_start_alignment_count
|
||||
reduction_mode,
|
||||
ktile_start_alignment_count
|
||||
);
|
||||
}
|
||||
|
||||
@@ -835,8 +845,9 @@ struct PersistentTileSchedulerSm90StreamKParams {
|
||||
RasterOrderOptions raster_order_option,
|
||||
DecompositionMode decomposition_mode,
|
||||
ReductionMode reduction_mode,
|
||||
uint32_t epilogue_subtile
|
||||
, uint32_t ktile_start_alignment_count
|
||||
uint32_t epilogue_subtile,
|
||||
uint32_t ktile_start_alignment_count,
|
||||
bool bypass_sm90_occupancy_calculation=false
|
||||
) {
|
||||
|
||||
// Get block numbers in m, n and l dimensions
|
||||
@@ -863,7 +874,8 @@ struct PersistentTileSchedulerSm90StreamKParams {
|
||||
cluster_shape,
|
||||
hw_info,
|
||||
max_swizzle,
|
||||
raster_order_option
|
||||
raster_order_option,
|
||||
bypass_sm90_occupancy_calculation
|
||||
);
|
||||
uint64_t ctas_per_wave = grid.x * grid.y;
|
||||
cluster_size = cluster_shape.m() * cluster_shape.n();
|
||||
@@ -875,8 +887,8 @@ struct PersistentTileSchedulerSm90StreamKParams {
|
||||
ctas_per_wave,
|
||||
cluster_size,
|
||||
k_tiles_per_output_tile,
|
||||
decomposition_mode
|
||||
, ctas_per_wave_in_full_clusters
|
||||
decomposition_mode,
|
||||
ctas_per_wave_in_full_clusters
|
||||
);
|
||||
uint64_t dp_tiles = output_tiles - sk_tiles;
|
||||
// Calculate the number of work units covering the data-parallel and stream-K tiles.
|
||||
@@ -1035,7 +1047,8 @@ struct PersistentTileSchedulerSm90StreamKParams {
|
||||
GemmCoord cluster_shape,
|
||||
KernelHardwareInfo hw_info,
|
||||
int max_swizzle_size,
|
||||
RasterOrderOptions raster_order_option
|
||||
RasterOrderOptions raster_order_option,
|
||||
bool bypass_sm90_occupancy_calculation=false
|
||||
) {
|
||||
|
||||
dim3 problem_blocks = UnderlyingParams::get_tiled_cta_shape_mnl(problem_shape, cta_shape, cluster_shape);
|
||||
@@ -1045,7 +1058,8 @@ struct PersistentTileSchedulerSm90StreamKParams {
|
||||
cluster_shape,
|
||||
hw_info,
|
||||
max_swizzle_size,
|
||||
raster_order_option
|
||||
raster_order_option,
|
||||
bypass_sm90_occupancy_calculation
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1059,7 +1073,8 @@ struct PersistentTileSchedulerSm90StreamKParams {
|
||||
GemmCoord cluster_shape,
|
||||
KernelHardwareInfo hw_info,
|
||||
int max_swizzle_size,
|
||||
RasterOrderOptions raster_order_option
|
||||
RasterOrderOptions raster_order_option,
|
||||
bool bypass_sm90_occupancy_calculation=false
|
||||
) {
|
||||
|
||||
// Call into the underlying get_grid_shape method, but do not allow the grid shape returned
|
||||
@@ -1070,8 +1085,8 @@ struct PersistentTileSchedulerSm90StreamKParams {
|
||||
hw_info,
|
||||
max_swizzle_size,
|
||||
raster_order_option,
|
||||
/* truncate_by_problem_size = */false
|
||||
/* bypass_occupancy_calculation = */, true
|
||||
/* truncate_by_problem_size = */false,
|
||||
bypass_sm90_occupancy_calculation
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1186,7 +1201,8 @@ struct PersistentTileSchedulerSm90StreamKParams {
|
||||
uint32_t accumulator_bits,
|
||||
uint32_t epilogue_subtile = 1,
|
||||
uint32_t num_accumulator_mtxs = 1,
|
||||
uint32_t ktile_start_alignment_count = 1) {
|
||||
uint32_t ktile_start_alignment_count = 1,
|
||||
bool bypass_sm90_occupancy_calculation=false) {
|
||||
|
||||
auto log_swizzle_size = UnderlyingParams::get_log_swizzle_size(problem_blocks.x, problem_blocks.y, max_swizzle);
|
||||
problem_blocks.x = round_up(problem_blocks.x, (1 << log_swizzle_size) * cluster_shape.m());
|
||||
@@ -1216,7 +1232,8 @@ struct PersistentTileSchedulerSm90StreamKParams {
|
||||
cluster_shape,
|
||||
new_hw_info,
|
||||
max_swizzle,
|
||||
raster_order_option
|
||||
raster_order_option,
|
||||
bypass_sm90_occupancy_calculation
|
||||
);
|
||||
uint64_t ctas_per_wave = grid.x * grid.y;
|
||||
uint64_t cluster_size = cluster_shape.m() * cluster_shape.n();
|
||||
@@ -1347,7 +1364,8 @@ struct PersistentTileSchedulerSm90StreamKParams {
|
||||
uint32_t element_accumulator_bits,
|
||||
uint32_t epilogue_subtile = 1,
|
||||
uint32_t num_accumulator_mtxs = 1,
|
||||
uint32_t ktile_start_alignment_count = 1) {
|
||||
uint32_t ktile_start_alignment_count = 1,
|
||||
bool bypass_sm90_occupancy_calculation=false) {
|
||||
|
||||
size_t barrier_workspace_size = 0;
|
||||
size_t reduction_workspace_size = 0;
|
||||
@@ -1371,7 +1389,8 @@ struct PersistentTileSchedulerSm90StreamKParams {
|
||||
element_accumulator_bits,
|
||||
epilogue_subtile,
|
||||
num_accumulator_mtxs,
|
||||
ktile_start_alignment_count
|
||||
ktile_start_alignment_count,
|
||||
bypass_sm90_occupancy_calculation
|
||||
);
|
||||
#endif
|
||||
|
||||
@@ -1449,7 +1468,8 @@ struct PersistentTileSchedulerSm90StreamKParams {
|
||||
uint32_t epilogue_subtile = 1,
|
||||
uint32_t num_accumulator_mtxs = 1,
|
||||
CudaHostAdapter* cuda_adapter = nullptr,
|
||||
uint32_t ktile_start_alignment_count = 1) {
|
||||
uint32_t ktile_start_alignment_count = 1,
|
||||
bool bypass_sm90_occupancy_calculation=false) {
|
||||
|
||||
#if !defined(__CUDACC_RTC__)
|
||||
uint64_t barrier_workspace_size = 0;
|
||||
@@ -1473,7 +1493,8 @@ struct PersistentTileSchedulerSm90StreamKParams {
|
||||
element_accumulator_bits,
|
||||
epilogue_subtile,
|
||||
num_accumulator_mtxs,
|
||||
ktile_start_alignment_count
|
||||
ktile_start_alignment_count,
|
||||
bypass_sm90_occupancy_calculation
|
||||
);
|
||||
|
||||
if (barrier_workspace_size > 0) {
|
||||
@@ -2165,7 +2186,8 @@ struct PersistentTileSchedulerSm100StreamKParams {
|
||||
decomposition_mode,
|
||||
workspace,
|
||||
/*epilogue_subtile=*/1,
|
||||
ktile_start_alignment_count
|
||||
ktile_start_alignment_count,
|
||||
/*bypass_sm90_occupancy_calculation=*/true
|
||||
);
|
||||
|
||||
log_swizzle_size_ = sk_params_.log_swizzle_size_;
|
||||
@@ -2334,7 +2356,8 @@ struct PersistentTileSchedulerSm100StreamKParams {
|
||||
element_accumulator_bits,
|
||||
epilogue_subtile,
|
||||
num_accumulator_mtxs,
|
||||
ktile_start_alignment_count
|
||||
ktile_start_alignment_count,
|
||||
/*bypass_sm90_occupancy_calculation=*/true
|
||||
);
|
||||
}
|
||||
|
||||
@@ -2431,7 +2454,8 @@ struct PersistentTileSchedulerSm100StreamKParams {
|
||||
epilogue_subtile,
|
||||
num_accumulator_mtxs,
|
||||
cuda_adapter,
|
||||
ktile_start_alignment_count
|
||||
ktile_start_alignment_count,
|
||||
/*bypass_sm90_occupancy_calculation=*/true
|
||||
);
|
||||
}
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user