diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp index 20bbd58f61..dda1749d48 100644 --- a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp @@ -33,13 +33,16 @@ template -bool profile_grouped_conv_bwd_data_impl(int do_verification, - int init_method, - bool do_log, - bool time_kernel, - const ck::utils::conv::ConvParam& conv_param, - ck::index_t split_k = 1, - index_t instance_index = -1) +bool profile_grouped_conv_bwd_data_impl( + int do_verification, + int init_method, + bool do_log, + bool time_kernel, + const ck::utils::conv::ConvParam& conv_param, + ck::index_t split_k = 1, + index_t instance_index = -1, + std::optional> init_bounds_out = std::nullopt, + std::optional> init_bounds_wei = std::nullopt) { using OutElementOp = ck::tensor_operation::element_wise::PassThrough; using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; @@ -72,6 +75,16 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_element_space_size); DeviceMem in_device_buf(sizeof(InDataType) * in_element_space_size); + // Initialization bounds for output and weight tensors + // Output: {-5, -4, ..., 3, 4} for integers, [0, 1) for floats. + const auto default_out = + (init_method == 1) ? std::array{-5.0f, 5.0f} : std::array{0.0f, 1.0f}; + const auto [out_min, out_max] = init_bounds_out.value_or(default_out); + // Weight: {-5, -4, ..., 3, 4} for integers, [-0.5, 0.5) for floats. + const auto default_wei = + (init_method == 1) ? 
std::array{-5.0f, 5.0f} : std::array{-0.5f, 0.5f}; + const auto [wei_min, wei_max] = init_bounds_wei.value_or(default_wei); + // Generate data directly on GPU using DeviceMem methods switch(init_method) { @@ -82,13 +95,13 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, break; case 1: // Discrete integer values in range [-5, 5] - out_device_buf.FillUniformRandInteger(-5, 5); - wei_device_buf.FillUniformRandInteger(-5, 5); + out_device_buf.FillUniformRandInteger(out_min, out_max); + wei_device_buf.FillUniformRandInteger(wei_min, wei_max); break; case 2: // Continuous float values - out_device_buf.FillUniformRandFp(0.0f, 1.0f); - wei_device_buf.FillUniformRandFp(-0.5f, 0.5f); + out_device_buf.FillUniformRandFp(out_min, out_max); + wei_device_buf.FillUniformRandFp(wei_min, wei_max); break; default: // Constant value 1 diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp index 54bb66c42e..7481129156 100644 --- a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp @@ -5,6 +5,7 @@ #include #include +#include #include #include "ck/ck.hpp" @@ -39,13 +40,16 @@ template -bool profile_grouped_conv_fwd_impl(int do_verification, - int init_method, - bool do_log, - bool time_kernel, - const ck::utils::conv::ConvParam& conv_param, - const OutElementOp out_element_op = OutElementOp{}, - index_t instance_index = -1) +bool profile_grouped_conv_fwd_impl( + int do_verification, + int init_method, + bool do_log, + bool time_kernel, + const ck::utils::conv::ConvParam& conv_param, + const OutElementOp out_element_op = OutElementOp{}, + index_t instance_index = -1, + std::optional> init_bounds_in = std::nullopt, + std::optional> init_bounds_wei = std::nullopt) { using InElementOp = ck::tensor_operation::element_wise::PassThrough; using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; @@ -100,6 +104,16 @@ 
bool profile_grouped_conv_fwd_impl(int do_verification, DeviceMem wei_device_buf(sizeof(WeiDataType) * weight_size); DeviceMem out_device_buf(sizeof(OutDataType) * output_size); + // Initialization bounds for input and weight tensors + // Input: {-5, -4, ..., 3, 4} for integers, [0, 1) for floats. + const auto default_in = + (init_method == 1) ? std::array{-5.0f, 5.0f} : std::array{0.0f, 1.0f}; + const auto [in_min, in_max] = init_bounds_in.value_or(default_in); + // Weight: {-5, -4, ..., 3, 4} for integers, [-0.5, 0.5) for floats. + const auto default_wei = + (init_method == 1) ? std::array{-5.0f, 5.0f} : std::array{-0.5f, 0.5f}; + const auto [wei_min, wei_max] = init_bounds_wei.value_or(default_wei); + // Generate data directly on GPU using DeviceMem methods switch(init_method) { @@ -109,14 +123,14 @@ bool profile_grouped_conv_fwd_impl(int do_verification, wei_device_buf.SetZero(); break; case 1: - // Discrete integer generation: {-5, -4, -3, ..., 3, 4} - in_device_buf.FillUniformRandInteger(-5, 5); - wei_device_buf.FillUniformRandInteger(-5, 5); + // Discrete integer generation + in_device_buf.FillUniformRandInteger(in_min, in_max); + wei_device_buf.FillUniformRandInteger(wei_min, wei_max); break; default: // Continuous float generation - in_device_buf.FillUniformRandFp(0.0f, 1.0f); - wei_device_buf.FillUniformRandFp(-0.5f, 0.5f); + in_device_buf.FillUniformRandFp(in_min, in_max); + wei_device_buf.FillUniformRandFp(wei_min, wei_max); } // Create host tensors (for verification if needed) diff --git a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_large_cases.cpp b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_large_cases.cpp index 207b085e1a..a83ee04109 100644 --- a/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_large_cases.cpp +++ b/test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_large_cases.cpp @@ -40,11 +40,14 @@ class TestGroupedConvndBwdData : public ::testing::Test DataType, DataType>( true, // do_verification - 
1, // init_method: integer value + 1, // init_method: integer values (bounds passed below) false, // do_log false, // time_kernel param, - split_k); + split_k, + -1, + std::optional>{{0, 5}}, + std::optional>{{0, 5}}); } } EXPECT_TRUE(pass); diff --git a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_large_cases.cpp b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_large_cases.cpp index c51918e98f..4197a2ab8d 100644 --- a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_large_cases.cpp +++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_large_cases.cpp @@ -7,6 +7,7 @@ #include #include +#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" #include "profiler/profile_grouped_conv_fwd_impl.hpp" template @@ -39,10 +40,14 @@ class TestGroupedConvndFwd : public ::testing::Test DataType, IndexType>( true, // do_verification - 1, // init_method: integer value + 1, // init_method: integer values (bounds passed below) false, // do_log false, // time_kernel - param); + param, + ck::tensor_operation::element_wise::PassThrough{}, + -1, + std::optional>{{0, 5}}, + std::optional>{{0, 5}}); } EXPECT_TRUE(pass); }