[CK][Test] Move device op instance lookup (DeviceOp alias and GetInstances() call) before data initialization in the profiler impls.

Signed-off-by: Michal Kulikowski <Michal.Kulikowski@amd.com>
This commit is contained in:
Michal Kulikowski
2026-03-11 11:39:42 +01:00
committed by Michał Kulikowski
parent 12599a6802
commit a3feb9c1df
13 changed files with 244 additions and 240 deletions

View File

@@ -99,6 +99,23 @@ bool profile_batched_contraction_multiple_d_impl(int do_verification,
std::cout << "d_gs_ms_ns: " << d_gs_ms_ns.mDesc << std::endl;
std::cout << "e_gs_ms_ns: " << e_gs_ms_ns_host_result.mDesc << std::endl;
// get device op instances
using DeviceOp = ck::tensor_operation::device::DeviceBatchedContractionMultipleD<NumDimG,
NumDimM,
NumDimN,
NumDimK,
ADataType,
BDataType,
DsDataType,
EDataType,
AElementOp,
BElementOp,
CDEElementOp>;
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
switch(init_method)
{
case 0: break;
@@ -181,23 +198,6 @@ bool profile_batched_contraction_multiple_d_impl(int do_verification,
}
}
// get device op instances
using DeviceOp = ck::tensor_operation::device::DeviceBatchedContractionMultipleD<NumDimG,
NumDimM,
NumDimN,
NumDimK,
ADataType,
BDataType,
DsDataType,
EDataType,
AElementOp,
BElementOp,
CDEElementOp>;
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
float best_ave_time = 0;
float best_tflops = 0;

View File

@@ -85,6 +85,12 @@ bool profile_batched_gemm_impl(int do_verification,
std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl;
std::cout << "c_g_m_n: " << c_g_m_n_host_result.mDesc << std::endl;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
switch(init_method)
{
case 0: break;
@@ -129,12 +135,6 @@ bool profile_batched_gemm_impl(int do_verification,
b_device_buf.ToDevice(b_g_k_n.mData.data());
c_device_buf.ToDevice(c_g_m_n_device_result.mData.data());
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
float best_ave_time = 0;
float best_tflops = 0;

View File

@@ -63,6 +63,27 @@ bool profile_batchnorm_backward_impl(bool do_verification,
};
}
using PassThroughOp = ck::tensor_operation::element_wise::PassThrough;
// add device batchnorm-backward instances
using DeviceOp = ck::tensor_operation::device::DeviceBatchNormBwd<XDataType,
DxDataType,
DxDataType,
AccDataType,
ScaleDataType,
DscaleDbiasDataType,
MeanVarDataType,
PassThroughOp,
Rank,
NumBatchNormReduceDim>;
// get device op instances
const auto instance_ptrs =
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;
// input data of the batchnorm backward algorithm
Tensor<XDataType> x(inOutLengths);
Tensor<DyDataType> dy(inOutLengths);
@@ -191,27 +212,6 @@ bool profile_batchnorm_backward_impl(bool do_verification,
std::copy(reduceDims.begin(), reduceDims.end(), arrReduceDims.begin());
using PassThroughOp = ck::tensor_operation::element_wise::PassThrough;
// add device batchnorm-backward instances
using DeviceOp = ck::tensor_operation::device::DeviceBatchNormBwd<XDataType,
DxDataType,
DxDataType,
AccDataType,
ScaleDataType,
DscaleDbiasDataType,
MeanVarDataType,
PassThroughOp,
Rank,
NumBatchNormReduceDim>;
// get device op instances
const auto instance_ptrs =
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << instance_ptrs.size() << " instances" << std::endl;
std::string best_instance_name;
float best_avg_time = std::numeric_limits<float>::max();
float best_gb_per_sec = 0;

View File

@@ -111,6 +111,23 @@ bool profile_conv_bwd_data_impl(int do_verification,
std::cout << "weight: " << weight.mDesc << std::endl;
std::cout << "output: " << output.mDesc << std::endl;
using DeviceOp = ck::tensor_operation::device::DeviceConvBwdData<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
switch(init_method)
{
case 0: break;
@@ -179,23 +196,6 @@ bool profile_conv_bwd_data_impl(int do_verification,
gpu_ref_in_dev.FromDevice(gpu_ref_input.mData.data());
}
using DeviceOp = ck::tensor_operation::device::DeviceConvBwdData<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
float best_avg_time = 0;
float best_tflops = 0;

View File

@@ -89,6 +89,23 @@ bool profile_conv_fwd_impl(int do_verification,
std::cout << "weight: " << weight.mDesc << std::endl;
std::cout << "output: " << host_output.mDesc << std::endl;
using DeviceOp = ck::tensor_operation::device::DeviceConvFwd<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
switch(init_method)
{
case 0: break;
@@ -158,23 +175,6 @@ bool profile_conv_fwd_impl(int do_verification,
gpu_ref_out_dev.FromDevice(gpu_ref_output.mData.data());
}
using DeviceOp = ck::tensor_operation::device::DeviceConvFwd<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
float best_avg_time = 0;
float best_tflops = 0;

View File

@@ -115,6 +115,28 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
std::cout << "wei: " << wei_g_k_c_xs_desc << std::endl;
std::cout << "in: " << in_g_n_c_wis_desc << std::endl;
using DeviceOp =
ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD<NDimSpatial,
OutLayout,
WeiLayout,
ck::Tuple<>,
InLayout,
OutDataType,
WeiDataType,
ck::Tuple<>,
InDataType,
OutElementOp,
WeiElementOp,
InElementOp,
ComputeDataType,
ComputeDataType>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
// Create host tensors
Tensor<OutDataType> out(out_g_n_k_wos_desc);
Tensor<WeiDataType> wei(wei_g_k_c_xs_desc);
@@ -438,26 +460,6 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
};
// do GEMM
using DeviceOp =
ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD<NDimSpatial,
OutLayout,
WeiLayout,
ck::Tuple<>,
InLayout,
OutDataType,
WeiDataType,
ck::Tuple<>,
InDataType,
OutElementOp,
WeiElementOp,
InElementOp,
ComputeDataType,
ComputeDataType>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::array<ck::index_t, NDimSpatial + 3> out_lengths{};
std::array<ck::index_t, NDimSpatial + 3> out_strides{};
std::array<ck::index_t, NDimSpatial + 3> wei_lengths{};

View File

@@ -118,6 +118,25 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
std::cout << "weight: " << wei_g_k_c_xs_desc << std::endl;
std::cout << "output: " << out_g_n_k_wos_desc << std::endl;
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdWeight<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ComputeTypeA,
ComputeTypeB>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
// Create host tensors
Tensor<InDataType> input(in_g_n_c_wis_desc);
Tensor<WeiDataType> weight_host_result(wei_g_k_c_xs_desc);
@@ -244,25 +263,6 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
}
}
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdWeight<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ComputeTypeA,
ComputeTypeB>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
std::string best_op_name;
float best_avg_time = 0;
float best_tflops = 0;

View File

@@ -129,6 +129,28 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
std::cout << "output: " << host_output.mDesc << std::endl;
std::cout << "bias: " << bias.mDesc << std::endl;
using DeviceOp =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<OutLayout>,
OutLayout,
InDataType,
WeiDataType,
ck::Tuple<OutDataType>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
AComputeType,
BComputeType>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "ckProfiler found " << op_ptrs.size() << " instances" << std::endl;
switch(init_method)
{
case 0: break;
@@ -339,28 +361,6 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification,
}
};
using DeviceOp =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<OutLayout>,
OutLayout,
InDataType,
WeiDataType,
ck::Tuple<OutDataType>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
AComputeType,
BComputeType>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "ckProfiler found " << op_ptrs.size() << " instances" << std::endl;
if constexpr(BiasGK)
{
constexpr ck::index_t spatial_offset = 3;

View File

@@ -107,6 +107,27 @@ bool profile_grouped_conv_fwd_bilinear_impl(
std::cout << "d_tensor: " << d_tensor.mDesc << std::endl;
std::cout << "output: " << host_output.mDesc << std::endl;
using DeviceOp =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<DLayout>,
OutLayout,
InDataType,
WeiDataType,
ck::Tuple<DDataType>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
AComputeType,
BComputeType>;
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
switch(init_method)
{
case 0: break;
@@ -231,27 +252,6 @@ bool profile_grouped_conv_fwd_bilinear_impl(
float best_gb_per_sec = 0;
int valids = 0;
using DeviceOp =
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<DLayout>,
OutLayout,
InDataType,
WeiDataType,
ck::Tuple<DDataType>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
AComputeType,
BComputeType>;
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
for(std::size_t i = 0; i < op_ptrs.size(); ++i)
{
auto& op_ptr = op_ptrs[i];

View File

@@ -145,6 +145,27 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
std::cout << "weight: " << wei_g_k_c_xs_desc << std::endl;
std::cout << "output: " << out_g_n_k_wos_desc << std::endl;
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<>,
OutLayout,
InDataType,
WeiDataType,
ck::Tuple<>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
AComputeType,
BComputeType>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "ckProfiler found " << op_ptrs.size() << " instances" << std::endl;
// Create host tensors
Tensor<InDataType> input(in_g_n_c_wis_desc);
Tensor<WeiDataType> weight(wei_g_k_c_xs_desc);
@@ -411,27 +432,6 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
}
};
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<>,
OutLayout,
InDataType,
WeiDataType,
ck::Tuple<>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
AComputeType,
BComputeType>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "ckProfiler found " << op_ptrs.size() << " instances" << std::endl;
if(list_instances)
{
std::cout << "\nValid instances for this problem:" << std::endl;

View File

@@ -151,6 +151,27 @@ bool profile_grouped_conv_fwd_outelementop_impl(int do_verification,
std::cout << "scale_wei: " << scale_wei << std::endl;
std::cout << "scale_out: " << scale_out << std::endl;
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<>,
OutLayout,
InDataType,
WeiDataType,
ck::Tuple<>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
AComputeType,
BComputeType>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "ckProfiler found " << op_ptrs.size() << " instances" << std::endl;
// run reference op
if(do_verification == 1)
{
@@ -340,27 +361,6 @@ bool profile_grouped_conv_fwd_outelementop_impl(int do_verification,
}
};
using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NDimSpatial,
InLayout,
WeiLayout,
ck::Tuple<>,
OutLayout,
InDataType,
WeiDataType,
ck::Tuple<>,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
AComputeType,
BComputeType>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
std::cout << "ckProfiler found " << op_ptrs.size() << " instances" << std::endl;
for(auto& op_ptr : op_ptrs)
{
auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(),