From f8fc165140cbfef7ec295bd2e12839a4723a384a Mon Sep 17 00:00:00 2001 From: Andriy Roshchenko <107577548+andriy-ca@users.noreply.github.com> Date: Thu, 14 Nov 2024 09:40:50 -0700 Subject: [PATCH] Fix example_convnd_fwd_max_xdl_int8 failures on MI300 (#1666) * Improve test verbosity. * BUGFIX: Add missing initialization for reduction buffer * Change default initialization method Performance may be affected for fp32 and int8 examples. * Improve test verbosity * Cleanup [ROCm/composable_kernel commit: d805a461aae7454de448bc0305cce01192fbc198] --- .../common.hpp | 2 +- .../run_convnd_fwd_max_example.inc | 57 +++++++++++++------ .../gemm_add_add_mean_meansquare_xdl_fp16.cpp | 2 +- 3 files changed, 43 insertions(+), 18 deletions(-) diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp b/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp index 7e3130a1a1..036f288d0a 100644 --- a/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp +++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/common.hpp @@ -80,7 +80,7 @@ using RLayout = typename LayoutSettingSelector::RLayout; struct ExecutionConfig final { bool do_verification = true; - int init_method = 1; + int init_method = 2; bool time_kernel = false; }; diff --git a/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc b/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc index cebfeb51d6..d61aee81a4 100644 --- a/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc +++ b/example/10_convnd_fwd_multiple_d_multiple_reduce/run_convnd_fwd_max_example.inc @@ -73,16 +73,25 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size, Tensor conv_output_device(conv_output_g_n_k_wos_desc); Tensor r0_device(r0_desc); + std::cout << "input: " << conv_input.mDesc << std::endl; + std::cout << "weight: " << conv_weight.mDesc << std::endl; + std::cout << "output: " << conv_output_device.mDesc << std::endl; + std::cout << "reduction: " << r0_device.mDesc << std::endl << std::endl; + switch(config.init_method) { case 0: break; case 1: ck::utils::FillUniformDistributionIntegerValue{-8, 7}(conv_input); - ck::utils::FillUniformDistributionIntegerValue{-8, 7}(conv_weight); + ck::utils::FillUniformDistributionIntegerValue{-1, 1}(conv_weight); + break; + case 2: + ck::utils::FillUniformDistributionIntegerValue{-8, 7}(conv_input); + ck::utils::FillUniformDistribution{-1, 1}(conv_weight); break; default: - ck::utils::FillUniformDistribution{-5, 5}(conv_input); - ck::utils::FillUniformDistribution{-5, 5}(conv_weight); + ck::utils::FillUniformDistribution{-8, 7}(conv_input); + ck::utils::FillUniformDistribution{-1, 1}(conv_weight); } DeviceMem conv_input_device_buf(sizeof(ADataType) * conv_input.mDesc.GetElementSpaceSize()); @@ -161,15 +170,25 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size, return false; } + // XXX: DeviceGroupedConvFwdMultipleDMultipleR_Xdl_CShuffle will not initialize r0. + r0_device_buf.SetValue(ck::NumericLimits::Lowest()); + const float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); - const std::size_t flop = problem_size.GetFlops(); - const std::size_t num_btype = problem_size.GetByte(); + if(config.time_kernel) + { + const std::size_t flop = problem_size.GetFlops(); + const std::size_t num_btype = problem_size.GetByte(); - const float tflops = static_cast(flop) / 1.E9 / avg_time; - const float gb_per_sec = num_btype / 1.E6 / avg_time; - std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " - << conv.GetTypeString() << std::endl; + const float tflops = static_cast(flop) / 1.E9 / avg_time; + const float gb_per_sec = num_btype / 1.E6 / avg_time; + std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << conv.GetTypeString() << std::endl; + } + else + { + std::cout << "FINISHED: " << conv.GetTypeString() << std::endl; + } if(config.do_verification) { @@ -189,6 +208,7 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size, BElementOp{}, PassThrough{}); + std::cout << "\nRunning verification on CPU." << std::endl; ref_invoker.Run(ref_argument); Tensor r0_host(r0_device.mDesc); @@ -273,13 +293,18 @@ bool run_convnd_fwd_max(const ck::utils::conv::ConvParam& problem_size, conv_output_device_buf.FromDevice(conv_output_device.mData.data()); r0_device_buf.FromDevice(r0_device.mData.data()); - return ck::utils::check_err(conv_output_device, - conv_output_host, - "Error: incorrect results! (Matrix E)", - 1e-5f, - 1e-4f) && - ck::utils::check_err( - r0_device, r0_host, "Error: incorrect results! (Matrix R0)", 1e-5f, 1e-4f); + auto pass = ck::utils::check_err(conv_output_device, + conv_output_host, + "Error: incorrect results! (Matrix E)", + 1e-3f, + 1e-3f); + pass = + pass && ck::utils::check_err( + r0_device, r0_host, "Error: incorrect results! (Matrix R0)", 1e-3f, 1e-3f); + if(pass) + std::cout << "Verification on CPU: PASS" << std::endl; + + return pass; } return true; diff --git a/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp b/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp index 2f6533d448..a46eaa4816 100644 --- a/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp +++ b/example/16_gemm_multi_d_multi_reduces/gemm_add_add_mean_meansquare_xdl_fp16.cpp @@ -198,7 +198,7 @@ int main() throw std::runtime_error("wrong! this device_op instance does not support this problem"); } - // init reducetion buffer to 0 + // init reduction buffer to 0 r0_device_buf.SetZero(); r1_device_buf.SetZero();