diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp index d74cf57649..3dc679860e 100644 --- a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp @@ -58,37 +58,63 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, const auto in_g_n_c_wis_desc = ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_param); + std::cout << "out: " << out_g_n_k_wos_desc << std::endl; + std::cout << "wei: " << wei_g_k_c_xs_desc << std::endl; + std::cout << "in: " << in_g_n_c_wis_desc << std::endl; + + // Get element space sizes + const auto out_element_space_size = out_g_n_k_wos_desc.GetElementSpaceSize(); + const auto wei_element_space_size = wei_g_k_c_xs_desc.GetElementSpaceSize(); + const auto in_element_space_size = in_g_n_c_wis_desc.GetElementSpaceSize(); + + // Allocate GPU buffers + DeviceMem out_device_buf(sizeof(OutDataType) * out_element_space_size); + DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_element_space_size); + DeviceMem in_device_buf(sizeof(InDataType) * in_element_space_size); + + // Generate data directly on GPU using DeviceMem methods + switch(init_method) + { + case 0: + // Zero initialization + out_device_buf.SetZero(); + wei_device_buf.SetZero(); + break; + case 1: + // Discrete integer values in range [-5, 5] + out_device_buf.FillUniformRandInteger(-5, 5); + wei_device_buf.FillUniformRandInteger(-5, 5); + break; + case 2: + // Continuous float values + out_device_buf.FillUniformRandFp(0.0f, 1.0f); + wei_device_buf.FillUniformRandFp(-0.5f, 0.5f); + break; + default: + // Constant value 1 + out_device_buf.SetValue(ck::type_convert<OutDataType>(1)); + wei_device_buf.SetValue(ck::type_convert<WeiDataType>(1)); + } + + // Create host tensors (needed only for verification) Tensor<OutDataType> out(out_g_n_k_wos_desc); Tensor<WeiDataType> wei(wei_g_k_c_xs_desc); Tensor<InDataType> in_host(in_g_n_c_wis_desc); Tensor<InDataType> 
in_device(in_g_n_c_wis_desc); - std::cout << "out: " << out.mDesc << std::endl; - std::cout << "wei: " << wei.mDesc << std::endl; - std::cout << "in: " << in_host.mDesc << std::endl; - - switch(init_method) + // Copy GPU→CPU for the GPU-reference verification path + if(do_verification == 2) { - case 0: break; - case 1: - out.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5}); - wei.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5}); - break; - case 2: - out.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0}); - wei.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5}); - break; - default: - out.GenerateTensorValue(GeneratorTensor_1<OutDataType>{1}); - wei.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{1}); + out_device_buf.FromDevice(out.mData.data()); + wei_device_buf.FromDevice(wei.mData.data()); } - DeviceMem out_device_buf(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize()); - DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize()); - DeviceMem in_device_buf(sizeof(InDataType) * in_device.mDesc.GetElementSpaceSize()); - - out_device_buf.ToDevice(out.mData.data()); - wei_device_buf.ToDevice(wei.mData.data()); + // Copy to host only if CPU verification is needed + if(do_verification == 1) + { + out_device_buf.FromDevice(out.mData.data()); + wei_device_buf.FromDevice(wei.mData.data()); + } // Allocate GPU reference buffer (used only if do_verification == 2) DeviceMem gpu_ref_in_buf( diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp index 67ad21c572..f2698537a3 100644 --- a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp @@ -63,35 +63,52 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, const auto out_g_n_k_wos_desc = ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed(conv_param); + std::cout << "input: " << in_g_n_c_wis_desc << std::endl; 
+ std::cout << "weight: " << wei_g_k_c_xs_desc << std::endl; + std::cout << "output: " << out_g_n_k_wos_desc << std::endl; + + // Get element space sizes + const auto input_element_space_size = in_g_n_c_wis_desc.GetElementSpaceSize(); + const auto weight_element_space_size = wei_g_k_c_xs_desc.GetElementSpaceSize(); + const auto output_element_space_size = out_g_n_k_wos_desc.GetElementSpaceSize(); + + // Allocate GPU buffers + DeviceMem in_device_buf(sizeof(InDataType) * input_element_space_size); + DeviceMem wei_device_buf(sizeof(WeiDataType) * weight_element_space_size); + DeviceMem out_device_buf(sizeof(OutDataType) * output_element_space_size); + + // Generate data directly on GPU using DeviceMem methods + switch(init_method) + { + case 0: + // Zero initialization + in_device_buf.SetZero(); + out_device_buf.SetZero(); + break; + case 1: + // Discrete integer values in range [-5, 5] + in_device_buf.FillUniformRandInteger(-5, 5); + out_device_buf.FillUniformRandInteger(-5, 5); + break; + default: + // Continuous float values + in_device_buf.FillUniformRandFp(0.0f, 1.0f); + out_device_buf.FillUniformRandFp(-0.5f, 0.5f); + } + + // Create host tensors (needed only for verification) Tensor<InDataType> input(in_g_n_c_wis_desc); Tensor<WeiDataType> weight_host_result(wei_g_k_c_xs_desc); Tensor<WeiDataType> weight_device_result(wei_g_k_c_xs_desc); Tensor<OutDataType> output(out_g_n_k_wos_desc); - std::cout << "input: " << input.mDesc << std::endl; - std::cout << "weight: " << weight_host_result.mDesc << std::endl; - std::cout << "output: " << output.mDesc << std::endl; - - switch(init_method) + // Copy to host only if CPU verification is needed + if(do_verification == 1) { - case 0: break; - case 1: - input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); - output.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5}); - break; - default: - input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0}); - output.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5}); + in_device_buf.FromDevice(input.mData.data()); + 
out_device_buf.FromDevice(output.mData.data()); } - DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize()); - DeviceMem wei_device_buf(sizeof(WeiDataType) * - weight_device_result.mDesc.GetElementSpaceSize()); - DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpaceSize()); - - in_device_buf.ToDevice(input.mData.data()); - out_device_buf.ToDevice(output.mData.data()); - // Allocate GPU reference buffer (used only if do_verification == 2) DeviceMem gpu_ref_wei_buf( do_verification == 2 ? sizeof(WeiDataType) * weight_host_result.mDesc.GetElementSpaceSize() diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp index 874d1e115c..95b75ecff2 100644 --- a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp @@ -86,34 +86,52 @@ bool profile_grouped_conv_fwd_impl(int do_verification, copy(conv_param.input_left_pads_, input_left_pads); copy(conv_param.input_right_pads_, input_right_pads); + // Get element space sizes for GPU allocation + const auto input_size = in_g_n_c_wis_desc.GetElementSpaceSize(); + const auto weight_size = wei_g_k_c_xs_desc.GetElementSpaceSize(); + const auto output_size = out_g_n_k_wos_desc.GetElementSpaceSize(); + + std::cout << "input: " << in_g_n_c_wis_desc << std::endl; + std::cout << "weight: " << wei_g_k_c_xs_desc << std::endl; + std::cout << "output: " << out_g_n_k_wos_desc << std::endl; + + // Allocate GPU memory first (GPU-first workflow) + DeviceMem in_device_buf(sizeof(InDataType) * input_size); + DeviceMem wei_device_buf(sizeof(WeiDataType) * weight_size); + DeviceMem out_device_buf(sizeof(OutDataType) * output_size); + + // Generate data directly on GPU using DeviceMem methods + switch(init_method) + { + case 0: + // Zero initialization + in_device_buf.SetZero(); + wei_device_buf.SetZero(); + break; + case 1: + // Discrete integer generation: 
{-5, -4, -3, ..., 3, 4} + in_device_buf.FillUniformRandInteger(-5, 5); + wei_device_buf.FillUniformRandInteger(-5, 5); + break; + default: + // Continuous float generation + in_device_buf.FillUniformRandFp(0.0f, 1.0f); + wei_device_buf.FillUniformRandFp(-0.5f, 0.5f); + } + + // Create host tensors (for verification if needed) Tensor<InDataType> input(in_g_n_c_wis_desc); Tensor<WeiDataType> weight(wei_g_k_c_xs_desc); Tensor<OutDataType> host_output(out_g_n_k_wos_desc); Tensor<OutDataType> device_output(out_g_n_k_wos_desc); - std::cout << "input: " << input.mDesc << std::endl; - std::cout << "weight: " << weight.mDesc << std::endl; - std::cout << "output: " << host_output.mDesc << std::endl; - - switch(init_method) + // Copy to host only if CPU verification is needed + if(do_verification == 1) { - case 0: break; - case 1: - input.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5}); - weight.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5}); - break; - default: - input.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0}); - weight.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5}); + in_device_buf.FromDevice(input.mData.data()); + wei_device_buf.FromDevice(weight.mData.data()); } - DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpaceSize()); - DeviceMem wei_device_buf(sizeof(WeiDataType) * weight.mDesc.GetElementSpaceSize()); - DeviceMem out_device_buf(sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize()); - - in_device_buf.ToDevice(input.mData.data()); - wei_device_buf.ToDevice(weight.mData.data()); - // Allocate GPU reference buffer (used only if do_verification == 2) DeviceMem gpu_ref_out_buf( do_verification == 2 ? sizeof(OutDataType) * device_output.mDesc.GetElementSpaceSize() : 0);