mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-07-01 04:07:56 +00:00
Fix validation.
This commit is contained in:
@@ -49,7 +49,7 @@ using tile_grouped_conv_bwd_weight_bf16_instances = std::tuple<
|
||||
// GroupedConvolutionBackwardWeightInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 1, 16, 16, 64, 1, 1, 1, 16, 16, 32, 4, 4, 2, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardWeightInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 2, 16, 16, 64, 1, 1, 1, 16, 16, 32, 4, 4, 2, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardWeightInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 1, 16, 16, 64, 1, 1, 1, 16, 16, 32, 4, 4, 4, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardWeightInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 2, 16, 16, 64, 1, 1, 1, 16, 16, 32, 4, 4, 4, true, CK_TILE_PIPELINE_COMPUTE_V4>
|
||||
GroupedConvolutionBackwardWeightInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 2, 16, 16, 64, 1, 1, 1, 16, 16, 32, 2, 2, 2, true, CK_TILE_PIPELINE_COMPUTE_V4>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
|
||||
@@ -32,7 +32,7 @@ template <ck_tile::index_t NDimSpatial,
|
||||
struct GroupedConvolutionBackwardWeightBaseInvoker
|
||||
{
|
||||
virtual bool IsSupportedArgument(const ck_tile::GroupedConvBwdWeightHostArgs& args) const = 0;
|
||||
virtual float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel) const = 0;
|
||||
virtual float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel, int n_warmup, int n_repeat) const = 0;
|
||||
virtual std::string GetName(const ck_tile::GroupedConvBwdWeightHostArgs& args) const = 0;
|
||||
GroupedConvolutionBackwardWeightBaseInvoker() = default;
|
||||
GroupedConvolutionBackwardWeightBaseInvoker(const GroupedConvolutionBackwardWeightBaseInvoker&) = default;
|
||||
@@ -197,7 +197,7 @@ struct GroupedConvolutionBackwardWeightInvoker :
|
||||
return Kernel::IsSupportedArgument(args);
|
||||
};
|
||||
|
||||
float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel) const override
|
||||
float Run(const ck_tile::GroupedConvBwdWeightHostArgs& args, bool time_kernel, int n_warmup=5, int n_repeat=50) const override
|
||||
{
|
||||
const ck_tile::index_t gemm_k =
|
||||
args.N_ * std::accumulate(args.output_spatial_lengths_.begin(),
|
||||
@@ -226,8 +226,6 @@ struct GroupedConvolutionBackwardWeightInvoker :
|
||||
const dim3 grids = Kernel::GridSize(args);
|
||||
const dim3 blocks = Kernel::BlockSize();
|
||||
|
||||
constexpr int n_warmup = 5;
|
||||
constexpr int n_repeat = 50;
|
||||
ck_tile::stream_config s {nullptr, time_kernel, 1, n_warmup, n_repeat};
|
||||
|
||||
ave_time = ck_tile::launch_kernel(
|
||||
|
||||
@@ -19,10 +19,10 @@ using tile_grouped_conv_fwd_bf16_instances = std::tuple<
|
||||
//##############################| Num| InLayout| WeiLayout| OutLayout| InData| WeiData| OutData| In| Wei| Out| K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM|
|
||||
//##############################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline|
|
||||
//##############################| Spatial| | | | | | | Operation| Operation| Operation| CU| | | | | | | size| size| size| A| B| C| buffer| version|
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 2, 64, 64, 64, 2, 2, 1, 32, 32, 16, 2, 4, 2, false, CK_TILE_PIPELINE_COMPUTE_V3>
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 2, 64, 64, 64, 2, 2, 1, 32, 32, 8, 2, 4, 2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 2, 64, 64, 64, 2, 2, 1, 16, 16, 32, 2, 4, 2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 2, 64, 64, 64, 2, 2, 1, 16, 16, 32, 4, 4, 4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 2, 64, 64, 64, 2, 2, 1, 32, 32, 16, 2, 4, 2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 2, 64, 64, 64, 2, 2, 1, 32, 32, 8, 2, 4, 2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 2, 64, 64, 64, 2, 2, 1, 16, 16, 32, 2, 4, 2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 2, 64, 64, 64, 2, 2, 1, 16, 16, 32, 4, 4, 4, false, CK_TILE_PIPELINE_COMPUTE_V3>
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 2, 64, 64, 64, 2, 2, 1, 16, 16, 32, 8, 8, 8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 1, 64, 64, 64, 1, 1, 1, 32, 32, 8, 2, 2, 2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionForwardInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, 1, 64, 64, 64, 1, 1, 1, 32, 32, 16, 2, 2, 2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
|
||||
@@ -32,7 +32,7 @@ template <ck_tile::index_t NDimSpatial,
|
||||
struct GroupedConvolutionForwardBaseInvoker
|
||||
{
|
||||
virtual bool IsSupportedArgument(const ck_tile::GroupedConvFwdHostArgs& args) const = 0;
|
||||
virtual float Run(const ck_tile::GroupedConvFwdHostArgs& args, bool time_kernel) const = 0;
|
||||
virtual float Run(const ck_tile::GroupedConvFwdHostArgs& args, bool time_kernel, int n_warmup, int n_repeat) const = 0;
|
||||
virtual std::string GetName(const ck_tile::GroupedConvFwdHostArgs& args) const = 0;
|
||||
GroupedConvolutionForwardBaseInvoker() = default;
|
||||
GroupedConvolutionForwardBaseInvoker(const GroupedConvolutionForwardBaseInvoker&) = default;
|
||||
@@ -198,7 +198,7 @@ struct GroupedConvolutionForwardInvoker :
|
||||
return Kernel::IsSupportedArgument(args);
|
||||
};
|
||||
|
||||
float Run(const ck_tile::GroupedConvFwdHostArgs& args, bool time_kernel) const override
|
||||
float Run(const ck_tile::GroupedConvFwdHostArgs& args, bool time_kernel, int n_warmup=5, int n_repeat=50) const override
|
||||
{
|
||||
const ck_tile::index_t gemm_k =
|
||||
args.C_ * std::accumulate(args.filter_spatial_lengths_.begin(),
|
||||
@@ -227,8 +227,6 @@ struct GroupedConvolutionForwardInvoker :
|
||||
const dim3 grids = Kernel::GridSize(args);
|
||||
const dim3 blocks = Kernel::BlockSize();
|
||||
|
||||
constexpr int n_warmup = 5;
|
||||
constexpr int n_repeat = 50;
|
||||
ck_tile::stream_config s {nullptr, time_kernel, 1, n_warmup, n_repeat};
|
||||
|
||||
ave_time = ck_tile::launch_kernel(
|
||||
|
||||
@@ -97,14 +97,6 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
|
||||
output.SetZero();
|
||||
}
|
||||
|
||||
ck_tile::DeviceMem input_dev_buf(input.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem weight_dev_buf(weight.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem output_dev_buf(output.get_element_space_size_in_bytes());
|
||||
|
||||
input_dev_buf.ToDevice(input.data());
|
||||
weight_dev_buf.SetZero();
|
||||
output_dev_buf.ToDevice(output.data());
|
||||
|
||||
using DeviceOp = ops::GroupedConvolutionBackwardWeightBaseInvoker<
|
||||
NDimSpatial,
|
||||
InLayout,
|
||||
@@ -154,6 +146,14 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
|
||||
auto split_k_value = split_k_list[split_k_id];
|
||||
auto split_k_param_str = std::to_string(split_k_value);
|
||||
|
||||
ck_tile::DeviceMem input_dev_buf(input.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem weight_dev_buf(weight.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem output_dev_buf(output.get_element_space_size_in_bytes());
|
||||
|
||||
input_dev_buf.ToDevice(input.data());
|
||||
weight_dev_buf.SetZero();
|
||||
output_dev_buf.ToDevice(output.data());
|
||||
|
||||
ck_tile::GroupedConvBwdWeightHostArgs args(conv_param,
|
||||
input_dev_buf.GetDeviceBuffer(),
|
||||
weight_dev_buf.GetDeviceBuffer(),
|
||||
@@ -174,22 +174,16 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
|
||||
std::string op_name = op->GetName(args);
|
||||
std::cout << op_name << ", SplitK " << split_k_param_str << " is profiled..." << std::endl;
|
||||
|
||||
float avg_time = op->Run(args, time_kernel);
|
||||
|
||||
std::size_t flop = conv_param.GetFlops();
|
||||
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_btype / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops
|
||||
<< " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK "
|
||||
<< split_k_param_str << std::endl;
|
||||
|
||||
// Run verification first. If it doesn't pass, no need to do performance measurement.
|
||||
bool pass = false;
|
||||
if(do_verification)
|
||||
{
|
||||
constexpr int n_warmup = 0;
|
||||
constexpr int n_repeat = 1;
|
||||
|
||||
op->Run(args, false, n_warmup, n_repeat);
|
||||
weight_dev_buf.FromDevice(weight.data());
|
||||
|
||||
ck_tile::HostTensor<WeiDataType> weight_host_ref(wei_g_k_c_xs_desc);
|
||||
weight_host_ref.SetZero();
|
||||
|
||||
@@ -223,13 +217,30 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
|
||||
|
||||
bool is_valid = do_verification ? pass : true;
|
||||
|
||||
if(tflops > best_tflops && is_valid)
|
||||
if (is_valid)
|
||||
{
|
||||
best_op_name = op_name;
|
||||
best_tflops = tflops;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
best_split_k = split_k_param_str;
|
||||
constexpr int n_warmup = 5;
|
||||
constexpr int n_repeat = 50;
|
||||
float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat);
|
||||
|
||||
std::size_t flop = conv_param.GetFlops();
|
||||
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_btype / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops
|
||||
<< " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK "
|
||||
<< split_k_param_str << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_name = op_name;
|
||||
best_tflops = tflops;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
best_split_k = split_k_param_str;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
|
||||
@@ -100,14 +100,6 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
|
||||
weight.SetZero();
|
||||
}
|
||||
|
||||
ck_tile::DeviceMem input_dev_buf(input.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem weight_dev_buf(weight.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem output_dev_buf(output.get_element_space_size_in_bytes());
|
||||
|
||||
input_dev_buf.ToDevice(input.data());
|
||||
weight_dev_buf.ToDevice(output.data());
|
||||
output_dev_buf.SetZero();
|
||||
|
||||
using DeviceOp = ops::GroupedConvolutionForwardBaseInvoker<
|
||||
NDimSpatial,
|
||||
InLayout,
|
||||
@@ -136,6 +128,14 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
|
||||
bool all_pass = true;
|
||||
for(auto& op : ops)
|
||||
{
|
||||
ck_tile::DeviceMem input_dev_buf(input.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem weight_dev_buf(weight.get_element_space_size_in_bytes());
|
||||
ck_tile::DeviceMem output_dev_buf(output.get_element_space_size_in_bytes());
|
||||
|
||||
input_dev_buf.ToDevice(input.data());
|
||||
weight_dev_buf.ToDevice(output.data());
|
||||
output_dev_buf.SetZero();
|
||||
|
||||
ck_tile::GroupedConvFwdHostArgs args(conv_param,
|
||||
input_dev_buf.GetDeviceBuffer(),
|
||||
weight_dev_buf.GetDeviceBuffer(),
|
||||
@@ -155,20 +155,14 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
|
||||
std::string op_name = op->GetName(args);
|
||||
std::cout << op_name << " is profiled..." << std::endl;
|
||||
|
||||
float avg_time = op->Run(args, time_kernel);
|
||||
|
||||
std::size_t flop = conv_param.GetFlops();
|
||||
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_btype / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops
|
||||
<< " TFlops, " << gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
// Run verification first. If it doesn't pass, no need to do performance measurement.
|
||||
bool pass = false;
|
||||
if(do_verification)
|
||||
{
|
||||
constexpr int n_warmup = 0;
|
||||
constexpr int n_repeat = 1;
|
||||
|
||||
op->Run(args, false, n_warmup, n_repeat);
|
||||
output_dev_buf.FromDevice(output.data());
|
||||
|
||||
ck_tile::HostTensor<OutDataType> output_host_ref(out_g_n_k_wos_desc);
|
||||
@@ -202,13 +196,28 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
|
||||
}
|
||||
|
||||
bool is_valid = do_verification ? pass : true;
|
||||
|
||||
if(tflops > best_tflops && is_valid)
|
||||
if (is_valid)
|
||||
{
|
||||
best_op_name = op_name;
|
||||
best_tflops = tflops;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
constexpr int n_warmup = 5;
|
||||
constexpr int n_repeat = 50;
|
||||
float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat);
|
||||
|
||||
std::size_t flop = conv_param.GetFlops();
|
||||
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_btype / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops
|
||||
<< " TFlops, " << gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_name = op_name;
|
||||
best_tflops = tflops;
|
||||
best_avg_time = avg_time;
|
||||
best_gb_per_sec = gb_per_sec;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
|
||||
Reference in New Issue
Block a user