mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 02:02:46 +00:00
[CK][CK Tile] Conv Bwd Data flush cache and profiling improvements (#6090)
## Motivation

Improve the accuracy of conv bwd data performance measurements.

## Technical Details

- Enable cache flushing.
- For grouped conv we zero the conv input (GEMM output) inside the device op, so it is already included in the time measurement.
- For non-grouped conv we zero the conv input (GEMM output) outside the device op (in profile_conv_bwd_data_impl.hpp), so it was not included.
- In this PR I changed it so that zeroing is included when time_kernel/flush_cache is enabled, giving a fairer comparison. I changed it only for the time_kernel/flush_cache case because MIOpen runs its own zeroing for non-grouped solvers.

## Test Plan

test_grouped_conv_bwd_data_*

## Test Result

CI pending

## Submission Checklist

- [x] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
This commit is contained in:
@@ -903,13 +903,11 @@ struct GroupedConvolutionBackwardDataKernel
|
||||
const auto& d_block_window =
|
||||
MakeDBlockWindows(ds_ptr, kargs, group_id, block_idx_m, block_idx_n);
|
||||
|
||||
const index_t num_loop = amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitted_k));
|
||||
const bool has_hot_loop = GemmPipeline::BlockHasHotloop(num_loop);
|
||||
const TailNumber tail_num = GemmPipeline::GetBlockLoopTailNum(num_loop);
|
||||
const index_t num_loop = amd_wave_read_first_lane(TilePartitioner::GetLoopNum(splitted_k));
|
||||
|
||||
// Run GEMM cooperatively by whole workgroup.
|
||||
const auto& c_block_tile = GemmPipeline{}.template operator()(
|
||||
a_block_window, b_block_window, num_loop, has_hot_loop, tail_num, smem_ptr_0);
|
||||
a_block_window, b_block_window, num_loop, smem_ptr_0);
|
||||
|
||||
const index_t k_batch = amd_wave_read_first_lane(kargs.k_batch);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user