diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_invoker.hpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_invoker.hpp index 90874e6018..ce6b1ea154 100644 --- a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_invoker.hpp +++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_invoker.hpp @@ -126,7 +126,7 @@ struct GroupedConvolutionBackwardWeightInvoker } auto preprocess = [&]() { - if(args.k_batch > 1) + if(kargs.k_batch > 1) { ck_tile::hip_check_error(hipMemsetAsync( kargs.wei_ptr, 0, args.template GetWeightByte(), s.stream_id_)); diff --git a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_two_stage_invoker.hpp b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_two_stage_invoker.hpp index c4d618a0bf..6ba8c36a9c 100644 --- a/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_two_stage_invoker.hpp +++ b/example/ck_tile/20_grouped_convolution/grouped_convolution_backward_weight_two_stage_invoker.hpp @@ -180,7 +180,7 @@ struct GroupedConvolutionBackwardWeightTwoStageInvoker } auto preprocess = [&]() { - if(args.k_batch > 1) + if(kargs.k_batch > 1) ck_tile::hip_check_error( hipMemsetAsync(ws_args.wei_ptr, 0, diff --git a/experimental/builder/include/ck_tile/builder/testing/conv/ck_tile.hpp b/experimental/builder/include/ck_tile/builder/testing/conv/ck_tile.hpp index ac6241af85..d25c15909e 100644 --- a/experimental/builder/include/ck_tile/builder/testing/conv/ck_tile.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/conv/ck_tile.hpp @@ -6,6 +6,7 @@ #include "ck_tile/builder/testing/testing.hpp" #include "ck_tile/builder/testing/conv/fwd.hpp" #include "ck_tile/builder/testing/conv/bwd_weight.hpp" +#include "ck_tile/builder/factory/helpers/ck_tile/conv_tile_tensor_type.hpp" #include "ck_tile/host/kernel_launch.hpp" #include "ck_tile/ops/gemm.hpp" #include "ck_tile/ops/grouped_convolution.hpp" @@ -56,6 +57,7 @@ template ; const std::size_t zeroing_size = std::accumulate(std::begin(kargs.wei_g_k_c_xs_lengths.data), std::end(kargs.wei_g_k_c_xs_lengths.data), 1, @@ -64,10 +66,13 @@ template ) { - if(args.k_batch > 1) + if(kargs.k_batch > 1) { ck_tile::hip_check_error( - hipMemsetAsync(kargs.wei_ptr, 0, zeroing_size, s_conf.stream_id_)); + hipMemsetAsync(kargs.wei_ptr, + 0, + zeroing_size * sizeof(typename Types::EDataType), + s_conf.stream_id_)); } } }; @@ -156,7 +161,7 @@ template ) { - if(args.k_batch > 1) + if(kargs.k_batch > 1) { ck_tile::hip_check_error( hipMemsetAsync(ws_args.wei_ptr, diff --git a/experimental/grouped_convolution_tile_instances/generate_instances.py b/experimental/grouped_convolution_tile_instances/generate_instances.py index d06e939859..9de431ac73 100755 --- a/experimental/grouped_convolution_tile_instances/generate_instances.py +++ b/experimental/grouped_convolution_tile_instances/generate_instances.py @@ -434,7 +434,9 @@ def parse_bwd_weight_instances(instances, problem_name): if check_vectors(a_scalar_per_vector, b_scalar_per_vector, c_scalar_per_vector) == False: print(f"Skipping instance {instance_id} with irregular load since it's not supported yet.") continue - + if pipeline_version == "V6": + print(f"Skipping instance {instance_id} with V6 since it's not supported yet.") + continue conv = ConvInstanceTemplateParams( spec, diff --git a/profiler/include/profiler/grouped_convolution_backward_weight_tile_algs.hpp b/profiler/include/profiler/grouped_convolution_backward_weight_tile_algs.hpp index bd9c755e58..cf9a40b274 100644 --- a/profiler/include/profiler/grouped_convolution_backward_weight_tile_algs.hpp +++ b/profiler/include/profiler/grouped_convolution_backward_weight_tile_algs.hpp @@ -73,7 +73,7 @@ void run_cpu_validation(const ckt::Args& args, template std::tuple -get_rtol_atol(const int num_accums, const int num_accums_split_k, const float max_accumulated_value) +get_rtol_atol(const int num_accums, const int k_batch, const float max_accumulated_value) { using WeiDataType = std::conditional_t 0 ? k_batch : 64; auto rtol = ck_tile::get_relative_threshold( num_accums / num_accums_split_k); auto atol = ck_tile::get_absolute_threshold( @@ -150,14 +152,17 @@ run_grouped_conv_backward_weight_tile_algs(const ckt::Args& args, auto run_alg = [&](auto&& run_alg_func) { for(auto& k_batch : split_k_values) { - std::tie(is_supported, avg_time, op_name) = run_alg_func(args, inputs, outputs, s_conf); + ckt::Args args_k_batch = args; + args_k_batch.k_batch = k_batch; + std::tie(is_supported, avg_time, op_name) = + run_alg_func(args_k_batch, inputs, outputs, s_conf); if(is_supported) { ckt::ValidationReport report; auto&& [rtol, atol] = get_rtol_atol(num_accums, k_batch, max_accumulated_value); ckt::Outputs::reflect( - args, + args_k_batch, [&](std::string_view name, const auto& desc, void* ckt::Outputs::*ptr) { @@ -182,7 +187,7 @@ run_grouped_conv_backward_weight_tile_algs(const ckt::Args& args, << " Is all zero:" << error.is_all_zero() << " max err: " << error.max_error << std::endl; // Check with cpu verification to get a values - run_cpu_validation(args, outputs, reference.get()); + run_cpu_validation(args_k_batch, outputs, reference.get()); } all_instances_valid = false; } diff --git a/profiler/src/profile_grouped_conv_bwd_weight_tile.cpp b/profiler/src/profile_grouped_conv_bwd_weight_tile.cpp index 348ea1023f..2c8258f280 100644 --- a/profiler/src/profile_grouped_conv_bwd_weight_tile.cpp +++ b/profiler/src/profile_grouped_conv_bwd_weight_tile.cpp @@ -136,7 +136,12 @@ int call_profiler(const ckt::Args& args, const std::string& split_k, split_k, inputs.get(), outputs.get(), - ck_tile::stream_config{nullptr, time_kernel}); + ck_tile::stream_config{nullptr, + time_kernel, + 0 /*log_level*/, + 5 /*cold_iters*/, + 50 /*nrepeat_*/, + true /*is_gpu_timer_*/}); if(time_kernel) { std::cout << "\nBest configuration parameters:" << "\n\tname: " << op_name