Fix large case init bounds

This commit is contained in:
Graner, Johannes
2026-01-16 08:01:42 -05:00
parent 644cdbe3c9
commit 1abca05c6f
4 changed files with 62 additions and 27 deletions

View File

@@ -33,13 +33,16 @@ template <ck::index_t NDimSpatial,
typename WeiDataType,
typename InDataType,
typename ComputeDataType = InDataType>
bool profile_grouped_conv_bwd_data_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
const ck::utils::conv::ConvParam& conv_param,
ck::index_t split_k = 1,
index_t instance_index = -1)
bool profile_grouped_conv_bwd_data_impl(
int do_verification,
int init_method,
bool do_log,
bool time_kernel,
const ck::utils::conv::ConvParam& conv_param,
ck::index_t split_k = 1,
index_t instance_index = -1,
std::optional<std::array<float, 2>> init_bounds_out = std::nullopt,
std::optional<std::array<float, 2>> init_bounds_wei = std::nullopt)
{
using OutElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -72,6 +75,16 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_element_space_size);
DeviceMem in_device_buf(sizeof(InDataType) * in_element_space_size);
// Initialization bounds for output and weight tensors
// Output: {-5, -4, ..., 3, 4} for integers, [0, 1) for floats.
const auto default_out =
(init_method == 1) ? std::array<float, 2>{-5.0f, 5.0f} : std::array<float, 2>{0.0f, 1.0f};
const auto [out_min, out_max] = init_bounds_out.value_or(default_out);
// Weight: {-5, -4, ..., 3, 4} for integers, [-0.5, 0.5) for floats.
const auto default_wei =
(init_method == 1) ? std::array<float, 2>{-5.0f, 5.0f} : std::array<float, 2>{-0.5f, 0.5f};
const auto [wei_min, wei_max] = init_bounds_wei.value_or(default_wei);
// Generate data directly on GPU using DeviceMem methods
switch(init_method)
{
@@ -82,13 +95,13 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
break;
case 1:
// Discrete integer values; defaults to {-5, -4, ..., 3, 4} unless init bounds are overridden
out_device_buf.FillUniformRandInteger<OutDataType>(-5, 5);
wei_device_buf.FillUniformRandInteger<WeiDataType>(-5, 5);
out_device_buf.FillUniformRandInteger<OutDataType>(out_min, out_max);
wei_device_buf.FillUniformRandInteger<WeiDataType>(wei_min, wei_max);
break;
case 2:
// Continuous float values
out_device_buf.FillUniformRandFp<OutDataType>(0.0f, 1.0f);
wei_device_buf.FillUniformRandFp<WeiDataType>(-0.5f, 0.5f);
out_device_buf.FillUniformRandFp<OutDataType>(out_min, out_max);
wei_device_buf.FillUniformRandFp<WeiDataType>(wei_min, wei_max);
break;
default:
// Constant value 1

View File

@@ -5,6 +5,7 @@
#include <iomanip>
#include <iostream>
#include <optional>
#include <typeinfo>
#include "ck/ck.hpp"
@@ -39,13 +40,16 @@ template <ck::index_t NDimSpatial,
typename BComputeType = AComputeType,
typename IndexType = ck::index_t,
typename OutElementOp = ck::tensor_operation::element_wise::PassThrough>
bool profile_grouped_conv_fwd_impl(int do_verification,
int init_method,
bool do_log,
bool time_kernel,
const ck::utils::conv::ConvParam& conv_param,
const OutElementOp out_element_op = OutElementOp{},
index_t instance_index = -1)
bool profile_grouped_conv_fwd_impl(
int do_verification,
int init_method,
bool do_log,
bool time_kernel,
const ck::utils::conv::ConvParam& conv_param,
const OutElementOp out_element_op = OutElementOp{},
index_t instance_index = -1,
std::optional<std::array<float, 2>> init_bounds_in = std::nullopt,
std::optional<std::array<float, 2>> init_bounds_wei = std::nullopt)
{
using InElementOp = ck::tensor_operation::element_wise::PassThrough;
using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
@@ -100,6 +104,16 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
DeviceMem wei_device_buf(sizeof(WeiDataType) * weight_size);
DeviceMem out_device_buf(sizeof(OutDataType) * output_size);
// Initialization bounds for input and weight tensors
// Input: {-5, -4, ..., 3, 4} for integers, [0, 1) for floats.
const auto default_in =
(init_method == 1) ? std::array<float, 2>{-5.0f, 5.0f} : std::array<float, 2>{0.0f, 1.0f};
const auto [in_min, in_max] = init_bounds_in.value_or(default_in);
// Weight: {-5, -4, ..., 3, 4} for integers, [-0.5, 0.5) for floats.
const auto default_wei =
(init_method == 1) ? std::array<float, 2>{-5.0f, 5.0f} : std::array<float, 2>{-0.5f, 0.5f};
const auto [wei_min, wei_max] = init_bounds_wei.value_or(default_wei);
// Generate data directly on GPU using DeviceMem methods
switch(init_method)
{
@@ -109,14 +123,14 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
wei_device_buf.SetZero();
break;
case 1:
// Discrete integer generation: {-5, -4, -3, ..., 3, 4}
in_device_buf.FillUniformRandInteger<InDataType>(-5, 5);
wei_device_buf.FillUniformRandInteger<WeiDataType>(-5, 5);
// Discrete integer generation
in_device_buf.FillUniformRandInteger<InDataType>(in_min, in_max);
wei_device_buf.FillUniformRandInteger<WeiDataType>(wei_min, wei_max);
break;
default:
// Continuous float generation
in_device_buf.FillUniformRandFp<InDataType>(0.0f, 1.0f);
wei_device_buf.FillUniformRandFp<WeiDataType>(-0.5f, 0.5f);
in_device_buf.FillUniformRandFp<InDataType>(in_min, in_max);
wei_device_buf.FillUniformRandFp<WeiDataType>(wei_min, wei_max);
}
// Create host tensors (for verification if needed)

View File

@@ -40,11 +40,14 @@ class TestGroupedConvndBwdData : public ::testing::Test
DataType,
DataType>(
true, // do_verification
1, // init_method: integer value
1, // init_method: integer value (bounds overridden below)
false, // do_log
false, // time_kernel
param,
split_k);
split_k,
-1,
std::optional<std::array<float, 2>>{{0, 5}},
std::optional<std::array<float, 2>>{{0, 5}});
}
}
EXPECT_TRUE(pass);

View File

@@ -7,6 +7,7 @@
#include <vector>
#include <gtest/gtest.h>
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
#include "profiler/profile_grouped_conv_fwd_impl.hpp"
template <typename Tuple>
@@ -39,10 +40,14 @@ class TestGroupedConvndFwd : public ::testing::Test
DataType,
IndexType>(
true, // do_verification
1, // init_method: integer value
1, // init_method: integer value (bounds overridden below)
false, // do_log
false, // time_kernel
param);
param,
ck::tensor_operation::element_wise::PassThrough{},
-1,
std::optional<std::array<float, 2>>{{0, 5}},
std::optional<std::array<float, 2>>{{0, 5}});
}
EXPECT_TRUE(pass);
}