From 74c4b5df53fd07a5db7140b2f7bfc31a87454821 Mon Sep 17 00:00:00 2001 From: Johannes Graner Date: Fri, 16 Jan 2026 19:56:53 +0100 Subject: [PATCH] [CK Profiler] Restore CPU tensor initialization when verification is not done on GPU (#3594) * Fix large case init bounds * Revert "Fix large case init bounds" This reverts commit 1abca05c6f71ff6fee83fa870d0c84d86279bb70. * Restore CPU initialization for do_verification != 2 [ROCm/composable_kernel commit: 3f735c127b8e78b702a31e19cb6e0e35eda3588a] --- .../profile_grouped_conv_bwd_data_impl.hpp | 93 +++++++++++-------- .../profile_grouped_conv_bwd_weight_impl.hpp | 74 +++++++++------ .../profile_grouped_conv_fwd_impl.hpp | 86 ++++++++++------- 3 files changed, 152 insertions(+), 101 deletions(-) diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp index 20bbd58f61..eceb70c05f 100644 --- a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp @@ -62,7 +62,13 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, std::cout << "wei: " << wei_g_k_c_xs_desc << std::endl; std::cout << "in: " << in_g_n_c_wis_desc << std::endl; - // Get element space sizes + // Create host tensors + Tensor out(out_g_n_k_wos_desc); + Tensor wei(wei_g_k_c_xs_desc); + Tensor in_host(in_g_n_c_wis_desc); + Tensor in_device(in_g_n_c_wis_desc); + + // Get element space sizes for allocation const auto out_element_space_size = out_g_n_k_wos_desc.GetElementSpaceSize(); const auto wei_element_space_size = wei_g_k_c_xs_desc.GetElementSpaceSize(); const auto in_element_space_size = in_g_n_c_wis_desc.GetElementSpaceSize(); @@ -72,48 +78,57 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_element_space_size); DeviceMem in_device_buf(sizeof(InDataType) * in_element_space_size); - // Generate data directly on GPU using DeviceMem methods - switch(init_method) + // Initialize tensors based on do_verification: + // - do_verification=2: GPU-side initialization + // - do_verification=0,1: CPU-side initialization + if(do_verification == 2) { - case 0: - // Zero initialization - out_device_buf.SetZero(); - wei_device_buf.SetZero(); - break; - case 1: - // Discrete integer values in range [-5, 5] - out_device_buf.FillUniformRandInteger(-5, 5); - wei_device_buf.FillUniformRandInteger(-5, 5); - break; - case 2: - // Continuous float values - out_device_buf.FillUniformRandFp(0.0f, 1.0f); - wei_device_buf.FillUniformRandFp(-0.5f, 0.5f); - break; - default: - // Constant value 1 - out_device_buf.SetValue(ck::type_convert(1)); - wei_device_buf.SetValue(ck::type_convert(1)); + // GPU-side initialization for GPU verification workflow + switch(init_method) + { + case 0: + // Zero initialization + out_device_buf.SetZero(); + wei_device_buf.SetZero(); + break; + case 1: + // Discrete integer values in range [-5, 5] + out_device_buf.FillUniformRandInteger(-5, 5); + wei_device_buf.FillUniformRandInteger(-5, 5); + break; + case 2: + // Continuous float values + out_device_buf.FillUniformRandFp(0.0f, 1.0f); + wei_device_buf.FillUniformRandFp(-0.5f, 0.5f); + break; + default: + // Constant value 1 + out_device_buf.SetValue(ck::type_convert(1)); + wei_device_buf.SetValue(ck::type_convert(1)); + } } - - // Create host tensors (needed only for verification) - Tensor out(out_g_n_k_wos_desc); - Tensor wei(wei_g_k_c_xs_desc); - Tensor in_host(in_g_n_c_wis_desc); - Tensor in_device(in_g_n_c_wis_desc); - - // Copy GPU→CPU only if verification is enabled - if(do_verification == 1 || do_verification == 2) + else { - out_device_buf.FromDevice(out.mData.data()); - wei_device_buf.FromDevice(wei.mData.data()); - } + // CPU-side initialization for do_verification=0,1 + switch(init_method) + { + case 0: break; // Tensors are already zero-initialized by default + case 1: + out.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + case 2: + out.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + wei.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + break; + default: + out.GenerateTensorValue(GeneratorTensor_1{1}); + wei.GenerateTensorValue(GeneratorTensor_1{1}); + } - // Copy to host only if CPU verification is needed - if(do_verification == 1) - { - out_device_buf.FromDevice(out.mData.data()); - wei_device_buf.FromDevice(wei.mData.data()); + // Copy initialized host data to device + out_device_buf.ToDevice(out.mData.data()); + wei_device_buf.ToDevice(wei.mData.data()); } // Allocate GPU reference buffer (used only if do_verification == 2) diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp index f1498f4c2d..3a9f14e595 100644 --- a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp @@ -67,7 +67,13 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, std::cout << "weight: " << wei_g_k_c_xs_desc << std::endl; std::cout << "output: " << out_g_n_k_wos_desc << std::endl; - // Get element space sizes + // Create host tensors + Tensor input(in_g_n_c_wis_desc); + Tensor weight_host_result(wei_g_k_c_xs_desc); + Tensor weight_device_result(wei_g_k_c_xs_desc); + Tensor output(out_g_n_k_wos_desc); + + // Get element space sizes for allocation const auto input_element_space_size = in_g_n_c_wis_desc.GetElementSpaceSize(); const auto weight_element_space_size = wei_g_k_c_xs_desc.GetElementSpaceSize(); const auto output_element_space_size = out_g_n_k_wos_desc.GetElementSpaceSize(); @@ -77,36 +83,48 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, DeviceMem wei_device_buf(sizeof(WeiDataType) * weight_element_space_size); DeviceMem out_device_buf(sizeof(OutDataType) * output_element_space_size); - // Generate data directly on GPU using DeviceMem methods - switch(init_method) + // Initialize tensors based on do_verification: + // - do_verification=2: GPU-side initialization + // - do_verification=0,1: CPU-side initialization + if(do_verification == 2) { - case 0: - // Zero initialization - in_device_buf.SetZero(); - out_device_buf.SetZero(); - break; - case 1: - // Discrete integer values in range [-5, 5] - in_device_buf.FillUniformRandInteger(-5, 5); - out_device_buf.FillUniformRandInteger(-5, 5); - break; - default: - // Continuous float values - in_device_buf.FillUniformRandFp(0.0f, 1.0f); - out_device_buf.FillUniformRandFp(-0.5f, 0.5f); + // GPU-side initialization for GPU verification workflow + switch(init_method) + { + case 0: + // Zero initialization + in_device_buf.SetZero(); + out_device_buf.SetZero(); + break; + case 1: + // Discrete integer values in range [-5, 5] + in_device_buf.FillUniformRandInteger(-5, 5); + out_device_buf.FillUniformRandInteger(-5, 5); + break; + default: + // Continuous float values + in_device_buf.FillUniformRandFp(0.0f, 1.0f); + out_device_buf.FillUniformRandFp(-0.5f, 0.5f); + } } - - // Create host tensors (needed only for verification) - Tensor input(in_g_n_c_wis_desc); - Tensor weight_host_result(wei_g_k_c_xs_desc); - Tensor weight_device_result(wei_g_k_c_xs_desc); - Tensor output(out_g_n_k_wos_desc); - - // Copy to host only if CPU verification is needed - if(do_verification == 1) + else { - in_device_buf.FromDevice(input.mData.data()); - out_device_buf.FromDevice(output.mData.data()); + // CPU-side initialization for do_verification=0,1 + switch(init_method) + { + case 0: break; // Tensors are already zero-initialized by default + case 1: + input.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + output.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + input.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + output.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + // Copy initialized host data to device + in_device_buf.ToDevice(input.mData.data()); + out_device_buf.ToDevice(output.mData.data()); } // Allocate GPU reference buffer (used only if do_verification == 2) diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp index 54bb66c42e..bbafdee417 100644 --- a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp @@ -86,50 +86,68 @@ bool profile_grouped_conv_fwd_impl(int do_verification, copy(conv_param.input_left_pads_, input_left_pads); copy(conv_param.input_right_pads_, input_right_pads); - // Get element space sizes for GPU allocation - const auto input_size = in_g_n_c_wis_desc.GetElementSpaceSize(); - const auto weight_size = wei_g_k_c_xs_desc.GetElementSpaceSize(); - const auto output_size = out_g_n_k_wos_desc.GetElementSpaceSize(); - std::cout << "input: " << in_g_n_c_wis_desc << std::endl; std::cout << "weight: " << wei_g_k_c_xs_desc << std::endl; std::cout << "output: " << out_g_n_k_wos_desc << std::endl; - // Allocate GPU memory first (GPU-first workflow) - DeviceMem in_device_buf(sizeof(InDataType) * input_size); - DeviceMem wei_device_buf(sizeof(WeiDataType) * weight_size); - DeviceMem out_device_buf(sizeof(OutDataType) * output_size); - - // Generate data directly on GPU using DeviceMem methods - switch(init_method) - { - case 0: - // Zero initialization - in_device_buf.SetZero(); - wei_device_buf.SetZero(); - break; - case 1: - // Discrete integer generation: {-5, -4, -3, ..., 3, 4} - in_device_buf.FillUniformRandInteger(-5, 5); - wei_device_buf.FillUniformRandInteger(-5, 5); - break; - default: - // Continuous float generation - in_device_buf.FillUniformRandFp(0.0f, 1.0f); - wei_device_buf.FillUniformRandFp(-0.5f, 0.5f); - } - - // Create host tensors (for verification if needed) + // Create host tensors Tensor input(in_g_n_c_wis_desc); Tensor weight(wei_g_k_c_xs_desc); Tensor host_output(out_g_n_k_wos_desc); Tensor device_output(out_g_n_k_wos_desc); - // Copy to host only if CPU verification is needed - if(do_verification == 1) + // Get element space sizes for allocation + const auto input_size = in_g_n_c_wis_desc.GetElementSpaceSize(); + const auto weight_size = wei_g_k_c_xs_desc.GetElementSpaceSize(); + const auto output_size = out_g_n_k_wos_desc.GetElementSpaceSize(); + + // Allocate GPU memory + DeviceMem in_device_buf(sizeof(InDataType) * input_size); + DeviceMem wei_device_buf(sizeof(WeiDataType) * weight_size); + DeviceMem out_device_buf(sizeof(OutDataType) * output_size); + + // Initialize tensors based on do_verification: + // - do_verification=2: GPU-side initialization + // - do_verification=0,1: CPU-side initialization + if(do_verification == 2) { - in_device_buf.FromDevice(input.mData.data()); - wei_device_buf.FromDevice(weight.mData.data()); + // GPU-side initialization for GPU verification workflow + switch(init_method) + { + case 0: + // Zero initialization + in_device_buf.SetZero(); + wei_device_buf.SetZero(); + break; + case 1: + // Discrete integer generation: {-5, -4, -3, ..., 3, 4} + in_device_buf.FillUniformRandInteger(-5, 5); + wei_device_buf.FillUniformRandInteger(-5, 5); + break; + default: + // Continuous float generation + in_device_buf.FillUniformRandFp(0.0f, 1.0f); + wei_device_buf.FillUniformRandFp(-0.5f, 0.5f); + } + } + else + { + // CPU-side initialization for do_verification=0,1 + switch(init_method) + { + case 0: break; // Tensors are already zero-initialized by default + case 1: + input.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + weight.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + input.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + weight.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + // Copy initialized host data to device + in_device_buf.ToDevice(input.mData.data()); + wei_device_buf.ToDevice(weight.mData.data()); } // Allocate GPU reference buffer (used only if do_verification == 2)