diff --git a/experimental/builder/include/ck_tile/builder/testing/conv/ck_tile.hpp b/experimental/builder/include/ck_tile/builder/testing/conv/ck_tile.hpp index d25c15909e..862d965e5e 100644 --- a/experimental/builder/include/ck_tile/builder/testing/conv/ck_tile.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/conv/ck_tile.hpp @@ -175,19 +175,40 @@ template (conv, grids, blocks, 0, kargs), - ck_tile::make_kernel(elementwise_op, - kGridSize, - kBlockSize, - 0, - input_size, - ck_tile::make_tuple(shape[1], 1), // Input Stride - ck_tile::make_tuple(shape[1], 1), // Output Stride - input_tensors, - static_cast(c_ptr)))); + if(s_conf.flush_cache_) + { + return RunResult::from_runtime(ck_tile::launch_kernel_time_mask_flush_cache( + s_conf, + preprocess, + ck_tile::make_kernel(conv, grids, blocks, 0, kargs), + ck_tile::make_kernel( + elementwise_op, + kGridSize, + kBlockSize, + 0, + input_size, + ck_tile::make_tuple(shape[1], 1), // Input Stride + ck_tile::make_tuple(shape[1], 1), // Output Stride + input_tensors, + static_cast(c_ptr)))); + } + else + { + return RunResult::from_runtime(ck_tile::launch_kernel_time_mask( + s_conf, + preprocess, + ck_tile::make_kernel(conv, grids, blocks, 0, kargs), + ck_tile::make_kernel( + elementwise_op, + kGridSize, + kBlockSize, + 0, + input_size, + ck_tile::make_tuple(shape[1], 1), // Input Stride + ck_tile::make_tuple(shape[1], 1), // Output Stride + input_tensors, + static_cast(c_ptr)))); + } } } // namespace detail diff --git a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp index e25c03b1a8..01078ba67f 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_multiple_d_xdl_cshuffle_v3.hpp @@ -479,7 +479,7 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3 const bool has_main_k_block_loop = GridwiseGemm::CalculateHasMainKBlockLoop(K_split); const auto Run = [&](const auto& kernel) { - if(stream_config.flush_cache) + if(stream_config.flush_cache && stream_config.rotating_count > 1) { std::array DsSize; @@ -534,6 +534,27 @@ struct DeviceBatchedGemmMultiD_Xdl_CShuffle_V3 0, arg_); } + else if(stream_config.flush_cache) + { + const auto clear_workspace = [&]() { + if(arg.KBatch > 1) + hipGetErrorString( + hipMemsetAsync(arg.p_c_grid, + 0, + arg.Batch * arg.M * arg.N * sizeof(CDataType), + stream_config.stream_id_)); + }; + + BatchGemmArgument arg_ = reinterpret_cast(arg); + ave_time = + launch_and_time_kernel_with_preprocess_flush_cache(stream_config, + clear_workspace, + kernel, + dim3(gdx, gdy, gdz), + dim3(BlockSize), + 0, + arg_); + } else { const auto clear_workspace = [&]() { diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp index 5228bdee98..99ec3387dc 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp @@ -1031,30 +1031,14 @@ struct DeviceGroupedConvBwdWeightTwoStage_Xdl_CShuffle const auto Run = [&](const auto& kernel) { if(stream_config.flush_cache) { - typename GridwiseGemm::Argument gemm_arg_ = gemm_arg; - ck::utility::RotatingMemWrapper rotating_mem( - gemm_arg_, - stream_config.rotating_count, - gemm_arg_.M * gemm_arg_.K * sizeof(ADataType), - gemm_arg_.K * gemm_arg_.N * sizeof(BDataType)); - rotating_mem.Print(); - - auto run_flush_cache = [&]() { - // flush icache - ck::utility::flush_icache(); - // rotating mem - rotating_mem.Next(); - clear_workspace(); - }; - - ave_time += ck::utility::launch_and_time_kernel_with_preprocess( + ave_time += launch_and_time_kernel_with_preprocess_flush_cache( stream_config, - run_flush_cache, + clear_workspace, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, - gemm_arg_, + gemm_arg, arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp index 585454221a..46a9009f83 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp @@ -998,30 +998,58 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffle hip_check_error(hipMemsetAsync( p_e_grid, 0, arg.c_space_size_bytes, stream_config.stream_id_)); }; - - avg_time += launch_and_time_kernel_with_preprocess( - stream_config, - clear_workspace, - kernel, - dim3(grid_size), - dim3(BlockSize), - 0, - p_a_grid, - p_b_grid, - p_e_grid, - arg.a_element_op_, - arg.b_element_op_, - arg.c_element_op_, - arg.Conv_G_, - arg.a_grid_desc_kbatch_k0_m_k1_, - arg.b_grid_desc_kbatch_k0_n_k1_, - c_grid_desc_mblock_mperblock_nblock_nperblock, - arg.block_2_ctile_map_, - arg.compute_ptr_offset_of_batch_, - arg.split_k_stride_a_, - arg.split_k_stride_b_, - arg.split_k_offset_hack_, - arg.k_batch_); + if(stream_config.flush_cache) + { + avg_time += launch_and_time_kernel_with_preprocess_flush_cache( + stream_config, + clear_workspace, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + p_a_grid, + p_b_grid, + p_e_grid, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.Conv_G_, + arg.a_grid_desc_kbatch_k0_m_k1_, + arg.b_grid_desc_kbatch_k0_n_k1_, + c_grid_desc_mblock_mperblock_nblock_nperblock, + arg.block_2_ctile_map_, + arg.compute_ptr_offset_of_batch_, + arg.split_k_stride_a_, + arg.split_k_stride_b_, + arg.split_k_offset_hack_, + arg.k_batch_); + } + else + { + avg_time += launch_and_time_kernel_with_preprocess( + stream_config, + clear_workspace, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + p_a_grid, + p_b_grid, + p_e_grid, + arg.a_element_op_, + arg.b_element_op_, + arg.c_element_op_, + arg.Conv_G_, + arg.a_grid_desc_kbatch_k0_m_k1_, + arg.b_grid_desc_kbatch_k0_n_k1_, + c_grid_desc_mblock_mperblock_nblock_nperblock, + arg.block_2_ctile_map_, + arg.compute_ptr_offset_of_batch_, + arg.split_k_stride_a_, + arg.split_k_stride_b_, + arg.split_k_offset_hack_, + arg.k_batch_); + } }; if(has_main_k0_block_loop) diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp index 718b04b955..2ab60581e7 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle_v3.hpp @@ -805,29 +805,14 @@ struct DeviceGroupedConvBwdWeight_Xdl_CShuffleV3 const auto Run = [&](const auto& kernel) { if(stream_config.flush_cache) { - typename GridwiseGemm::Argument gemm_arg_ = gemm_arg; - ck::utility::RotatingMemWrapper rotating_mem( - gemm_arg_, - stream_config.rotating_count, - gemm_arg_.M * gemm_arg_.K * sizeof(ADataType), - gemm_arg_.K * gemm_arg_.N * sizeof(BDataType)); - rotating_mem.Print(); - - auto run_flush_cache = [&]() { - // flush icache - ck::utility::flush_icache(); - // rotating mem - rotating_mem.Next(); - clear_workspace(); - }; - ave_time += ck::utility::launch_and_time_kernel_with_preprocess( + ave_time += launch_and_time_kernel_with_preprocess_flush_cache( stream_config, - run_flush_cache, + clear_workspace, kernel, dim3(gdx, gdy, gdz), dim3(BlockSize), 0, - gemm_arg_, + gemm_arg, arg.a_grid_desc_k0_m_k1_, arg.b_grid_desc_k0_n_k1_, arg.c_grid_desc_mblock_mperblock_nblock_nperblock_, diff --git a/profiler/include/profiler/grouped_convolution_backward_weight_tile_algs.hpp b/profiler/include/profiler/grouped_convolution_backward_weight_tile_algs.hpp index cf9a40b274..f69c5bb7a1 100644 --- a/profiler/include/profiler/grouped_convolution_backward_weight_tile_algs.hpp +++ b/profiler/include/profiler/grouped_convolution_backward_weight_tile_algs.hpp @@ -114,7 +114,8 @@ run_grouped_conv_backward_weight_tile_algs(const ckt::Args& args, const ckt::Outputs& outputs, const ck_tile::stream_config& s_conf) { - float best_avg_time = std::numeric_limits::max(); + bool dummy_run_executed = false; + float best_avg_time = std::numeric_limits::max(); std::string best_op_name, op_name; int best_split_k; bool is_supported; @@ -154,6 +155,13 @@ run_grouped_conv_backward_weight_tile_algs(const ckt::Args& args, { ckt::Args args_k_batch = args; args_k_batch.k_batch = k_batch; + if((s_conf.time_kernel_ || s_conf.flush_cache_) && !dummy_run_executed) + { + // Run first instance twice when profiling to stabilize timing + std::tie(is_supported, avg_time, op_name) = + run_alg_func(args_k_batch, inputs, outputs, s_conf); + dummy_run_executed = true; + } std::tie(is_supported, avg_time, op_name) = run_alg_func(args_k_batch, inputs, outputs, s_conf); if(is_supported) diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp index 9b7a68224f..fb4d312249 100644 --- a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp @@ -272,7 +272,8 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, index_t valid_instances = 0; // profile device Conv instances - bool all_pass = true; + bool all_pass = true; + bool dummy_run_executed = false; std::array input_lengths{}; std::array filter_lengths{}; @@ -400,8 +401,25 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, auto invoker_ptr = op_ptr->MakeInvokerPointer(); - float avg_time = - invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + if(time_kernel && !dummy_run_executed) + { + // Run first instance as dummy to get proper time from the first instance + invoker_ptr->Run(argument_ptr.get(), + StreamConfig{nullptr, + time_kernel, + 0 /*log_level*/, + 5 /*cold_iters*/, + 50 /*nrepeat_*/, + time_kernel /*flush_cache*/}); + dummy_run_executed = true; + } + float avg_time = invoker_ptr->Run(argument_ptr.get(), + StreamConfig{nullptr, + time_kernel, + 0 /*log_level*/, + 5 /*cold_iters*/, + 50 /*nrepeat_*/, + time_kernel /*flush_cache*/}); std::size_t flop = conv_param.GetFlops(); std::size_t num_btype = conv_param.GetByte(); diff --git a/profiler/src/profile_grouped_conv_bwd_weight_tile.cpp b/profiler/src/profile_grouped_conv_bwd_weight_tile.cpp index 2c8258f280..7ee82fe8a9 100644 --- a/profiler/src/profile_grouped_conv_bwd_weight_tile.cpp +++ b/profiler/src/profile_grouped_conv_bwd_weight_tile.cpp @@ -141,7 +141,8 @@ int call_profiler(const ckt::Args& args, const std::string& split_k, 0 /*log_level*/, 5 /*cold_iters*/, 50 /*nrepeat_*/, - true /*is_gpu_timer_*/}); + true /*is_gpu_timer_*/, + time_kernel /*flush_cache*/}); if(time_kernel) { std::cout << "\nBest configuration parameters:" << "\n\tname: " << op_name @@ -208,6 +209,14 @@ int profile_grouped_conv_bwd_weight_tile(int argc, char* argv[]) split_k, time_kernel); } + else if(data_type == ConvDataType::F32_F32_F32) + { + constexpr auto SIGNATURE = ckp::SIGNATURE_NHWGC_FP32_BWD_WEIGHT; + return call_profiler( + ckp::parse_conv_args(conv_params_start_idx, argv), + split_k, + time_kernel); + } } else if(num_dim_spatial == 3) { @@ -227,6 +236,14 @@ int profile_grouped_conv_bwd_weight_tile(int argc, char* argv[]) split_k, time_kernel); } + else if(data_type == ConvDataType::F32_F32_F32) + { + constexpr auto SIGNATURE = ckp::SIGNATURE_NDHWGC_FP32_BWD_WEIGHT; + return call_profiler( + ckp::parse_conv_args(conv_params_start_idx, argv), + split_k, + time_kernel); + } } }