Fix validation.

This commit is contained in:
Ville Pietilä
2025-10-17 13:07:06 +00:00
parent 6789c219c1
commit 7722f901df
6 changed files with 80 additions and 64 deletions

View File

@@ -49,7 +49,7 @@ using tile_grouped_conv_bwd_weight_bf16_instances = std::tuple<
// GroupedConvolutionBackwardWeightInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 1, 16, 16, 64, 1, 1, 1, 16, 16, 32, 4, 4, 2, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardWeightInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 2, 16, 16, 64, 1, 1, 1, 16, 16, 32, 4, 4, 2, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardWeightInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 1, 16, 16, 64, 1, 1, 1, 16, 16, 32, 4, 4, 4, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardWeightInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 2, 16, 16, 64, 1, 1, 1, 16, 16, 32, 4, 4, 4, true, CK_TILE_PIPELINE_COMPUTE_V4>
GroupedConvolutionBackwardWeightInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 2, 16, 16, 64, 1, 1, 1, 16, 16, 32, 2, 2, 2, true, CK_TILE_PIPELINE_COMPUTE_V4>
// clang-format on
>;

View File

@@ -32,7 +32,7 @@ template <ck_tile::index_t NDimSpatial,
struct GroupedConvolutionBackwardWeightBaseInvoker
{
virtual bool IsSupportedArgument(const ck_tile::GroupedConvBwdWeightHostArgs& args) const = 0;
virtual float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel) const = 0;
virtual float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel, int n_warmup, int n_repeat) const = 0;
virtual std::string GetName(const ck_tile::GroupedConvBwdWeightHostArgs& args) const = 0;
GroupedConvolutionBackwardWeightBaseInvoker() = default;
GroupedConvolutionBackwardWeightBaseInvoker(const GroupedConvolutionBackwardWeightBaseInvoker&) = default;
@@ -197,7 +197,7 @@ struct GroupedConvolutionBackwardWeightInvoker :
return Kernel::IsSupportedArgument(args);
};
float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel) const override
float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel, int n_warmup=5, int n_repeat=50) const override
{
const ck_tile::index_t gemm_k =
args.N_ * std::accumulate(args.output_spatial_lengths_.begin(),
@@ -226,8 +226,6 @@ struct GroupedConvolutionBackwardWeightInvoker :
const dim3 grids = Kernel::GridSize(args);
const dim3 blocks = Kernel::BlockSize();
constexpr int n_warmup = 5;
constexpr int n_repeat = 50;
ck_tile::stream_config s {nullptr, time_kernel, 1, n_warmup, n_repeat};
ave_time = ck_tile::launch_kernel(

View File

@@ -19,10 +19,10 @@ using tile_grouped_conv_fwd_bf16_instances = std::tuple<
//##############################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM|
//##############################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline|
//##############################| Spatial| | | | | | | Operation| Operation| Operation| CU| | | | | | | size| size| size| A| B| C| buffer| version|
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 2, 64, 64, 64, 2, 2, 1, 32, 32, 16, 2, 4, 2, false, CK_TILE_PIPELINE_COMPUTE_V3>
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 2, 64, 64, 64, 2, 2, 1, 32, 32, 8, 2, 4, 2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 2, 64, 64, 64, 2, 2, 1, 16, 16, 32, 2, 4, 2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 2, 64, 64, 64, 2, 2, 1, 16, 16, 32, 4, 4, 4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 2, 64, 64, 64, 2, 2, 1, 32, 32, 16, 2, 4, 2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 2, 64, 64, 64, 2, 2, 1, 32, 32, 8, 2, 4, 2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 2, 64, 64, 64, 2, 2, 1, 16, 16, 32, 2, 4, 2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 2, 64, 64, 64, 2, 2, 1, 16, 16, 32, 4, 4, 4, false, CK_TILE_PIPELINE_COMPUTE_V3>
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 2, 64, 64, 64, 2, 2, 1, 16, 16, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 1, 64, 64, 64, 1, 1, 1, 32, 32, 8, 2, 2, 2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 1, 64, 64, 64, 1, 1, 1, 32, 32, 16, 2, 2, 2, false, CK_TILE_PIPELINE_COMPUTE_V3>,

View File

@@ -32,7 +32,7 @@ template <ck_tile::index_t NDimSpatial,
struct GroupedConvolutionForwardBaseInvoker
{
virtual bool IsSupportedArgument(const ck_tile::GroupedConvFwdHostArgs& args) const = 0;
virtual float Run(const ck_tile::GroupedConvFwdHostArgs& args, bool time_kernel) const = 0;
virtual float Run(const ck_tile::GroupedConvFwdHostArgs& args, bool time_kernel, int n_warmup, int n_repeat) const = 0;
virtual std::string GetName(const ck_tile::GroupedConvFwdHostArgs& args) const = 0;
GroupedConvolutionForwardBaseInvoker() = default;
GroupedConvolutionForwardBaseInvoker(const GroupedConvolutionForwardBaseInvoker&) = default;
@@ -198,7 +198,7 @@ struct GroupedConvolutionForwardInvoker :
return Kernel::IsSupportedArgument(args);
};
float Run(const ck_tile::GroupedConvFwdHostArgs& args, bool time_kernel) const override
float Run(const ck_tile::GroupedConvFwdHostArgs& args, bool time_kernel, int n_warmup=5, int n_repeat=50) const override
{
const ck_tile::index_t gemm_k =
args.C_ * std::accumulate(args.filter_spatial_lengths_.begin(),
@@ -227,8 +227,6 @@ struct GroupedConvolutionForwardInvoker :
const dim3 grids = Kernel::GridSize(args);
const dim3 blocks = Kernel::BlockSize();
constexpr int n_warmup = 5;
constexpr int n_repeat = 50;
ck_tile::stream_config s {nullptr, time_kernel, 1, n_warmup, n_repeat};
ave_time = ck_tile::launch_kernel(

View File

@@ -97,14 +97,6 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
output.SetZero();
}
ck_tile::DeviceMem input_dev_buf(input.get_element_space_size_in_bytes());
ck_tile::DeviceMem weight_dev_buf(weight.get_element_space_size_in_bytes());
ck_tile::DeviceMem output_dev_buf(output.get_element_space_size_in_bytes());
input_dev_buf.ToDevice(input.data());
weight_dev_buf.SetZero();
output_dev_buf.ToDevice(output.data());
using DeviceOp = ops::GroupedConvolutionBackwardWeightBaseInvoker<
NDimSpatial,
InLayout,
@@ -154,6 +146,14 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
auto split_k_value = split_k_list[split_k_id];
auto split_k_param_str = std::to_string(split_k_value);
ck_tile::DeviceMem input_dev_buf(input.get_element_space_size_in_bytes());
ck_tile::DeviceMem weight_dev_buf(weight.get_element_space_size_in_bytes());
ck_tile::DeviceMem output_dev_buf(output.get_element_space_size_in_bytes());
input_dev_buf.ToDevice(input.data());
weight_dev_buf.SetZero();
output_dev_buf.ToDevice(output.data());
ck_tile::GroupedConvBwdWeightHostArgs args(conv_param,
input_dev_buf.GetDeviceBuffer(),
weight_dev_buf.GetDeviceBuffer(),
@@ -174,22 +174,16 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
std::string op_name = op->GetName(args);
std::cout << op_name << ", SplitK " << split_k_param_str << " is profiled..." << std::endl;
float avg_time = op->Run(args, time_kernel);
std::size_t flop = conv_param.GetFlops();
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_btype / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops
<< " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK "
<< split_k_param_str << std::endl;
// Run verification first. If it doesn't pass, no need to do performance measurement.
bool pass = false;
if(do_verification)
{
constexpr int n_warmup = 0;
constexpr int n_repeat = 1;
op->Run(args, false, n_warmup, n_repeat);
weight_dev_buf.FromDevice(weight.data());
ck_tile::HostTensor<WeiDataType> weight_host_ref(wei_g_k_c_xs_desc);
weight_host_ref.SetZero();
@@ -223,13 +217,30 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
bool is_valid = do_verification ? pass : true;
if(tflops > best_tflops && is_valid)
if (is_valid)
{
best_op_name = op_name;
best_tflops = tflops;
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
best_split_k = split_k_param_str;
constexpr int n_warmup = 5;
constexpr int n_repeat = 50;
float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat);
std::size_t flop = conv_param.GetFlops();
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_btype / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops
<< " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK "
<< split_k_param_str << std::endl;
if(tflops > best_tflops)
{
best_op_name = op_name;
best_tflops = tflops;
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
best_split_k = split_k_param_str;
}
}
}
else

View File

@@ -100,14 +100,6 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
weight.SetZero();
}
ck_tile::DeviceMem input_dev_buf(input.get_element_space_size_in_bytes());
ck_tile::DeviceMem weight_dev_buf(weight.get_element_space_size_in_bytes());
ck_tile::DeviceMem output_dev_buf(output.get_element_space_size_in_bytes());
input_dev_buf.ToDevice(input.data());
weight_dev_buf.ToDevice(output.data());
output_dev_buf.SetZero();
using DeviceOp = ops::GroupedConvolutionForwardBaseInvoker<
NDimSpatial,
InLayout,
@@ -136,6 +128,14 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
bool all_pass = true;
for(auto& op : ops)
{
ck_tile::DeviceMem input_dev_buf(input.get_element_space_size_in_bytes());
ck_tile::DeviceMem weight_dev_buf(weight.get_element_space_size_in_bytes());
ck_tile::DeviceMem output_dev_buf(output.get_element_space_size_in_bytes());
input_dev_buf.ToDevice(input.data());
weight_dev_buf.ToDevice(output.data());
output_dev_buf.SetZero();
ck_tile::GroupedConvFwdHostArgs args(conv_param,
input_dev_buf.GetDeviceBuffer(),
weight_dev_buf.GetDeviceBuffer(),
@@ -155,20 +155,14 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
std::string op_name = op->GetName(args);
std::cout << op_name << " is profiled..." << std::endl;
float avg_time = op->Run(args, time_kernel);
std::size_t flop = conv_param.GetFlops();
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_btype / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops
<< " TFlops, " << gb_per_sec << " GB/s, " << op_name << std::endl;
// Run verification first. If it doesn't pass, no need to do performance measurement.
bool pass = false;
if(do_verification)
{
constexpr int n_warmup = 0;
constexpr int n_repeat = 1;
op->Run(args, false, n_warmup, n_repeat);
output_dev_buf.FromDevice(output.data());
ck_tile::HostTensor<OutDataType> output_host_ref(out_g_n_k_wos_desc);
@@ -202,13 +196,28 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
}
bool is_valid = do_verification ? pass : true;
if(tflops > best_tflops && is_valid)
if (is_valid)
{
best_op_name = op_name;
best_tflops = tflops;
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
constexpr int n_warmup = 5;
constexpr int n_repeat = 50;
float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat);
std::size_t flop = conv_param.GetFlops();
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_btype / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops
<< " TFlops, " << gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
best_op_name = op_name;
best_tflops = tflops;
best_avg_time = avg_time;
best_gb_per_sec = gb_per_sec;
}
}
}
else