diff --git a/profiler/include/profiler/profile_batched_contraction_multiple_d_impl.hpp b/profiler/include/profiler/profile_batched_contraction_multiple_d_impl.hpp index e1035b37ed..1835a0b43c 100644 --- a/profiler/include/profiler/profile_batched_contraction_multiple_d_impl.hpp +++ b/profiler/include/profiler/profile_batched_contraction_multiple_d_impl.hpp @@ -99,6 +99,23 @@ bool profile_batched_contraction_multiple_d_impl(int do_verification, std::cout << "d_gs_ms_ns: " << d_gs_ms_ns.mDesc << std::endl; std::cout << "e_gs_ms_ns: " << e_gs_ms_ns_host_result.mDesc << std::endl; + // get device op instances + using DeviceOp = ck::tensor_operation::device::DeviceBatchedContractionMultipleD; + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + switch(init_method) { case 0: break; @@ -181,23 +198,6 @@ bool profile_batched_contraction_multiple_d_impl(int do_verification, } } - // get device op instances - using DeviceOp = ck::tensor_operation::device::DeviceBatchedContractionMultipleD; - const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); - - std::cout << "found " << op_ptrs.size() << " instances" << std::endl; - std::string best_op_name; float best_ave_time = 0; float best_tflops = 0; diff --git a/profiler/include/profiler/profile_batched_gemm_impl.hpp b/profiler/include/profiler/profile_batched_gemm_impl.hpp index 11a9da365d..149e659d54 100644 --- a/profiler/include/profiler/profile_batched_gemm_impl.hpp +++ b/profiler/include/profiler/profile_batched_gemm_impl.hpp @@ -85,6 +85,12 @@ bool profile_batched_gemm_impl(int do_verification, std::cout << "b_g_k_n: " << b_g_k_n.mDesc << std::endl; std::cout << "c_g_m_n: " << c_g_m_n_host_result.mDesc << std::endl; + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + switch(init_method) { case 0: break; @@ -129,12 +135,6 @@ bool profile_batched_gemm_impl(int do_verification, b_device_buf.ToDevice(b_g_k_n.mData.data()); c_device_buf.ToDevice(c_g_m_n_device_result.mData.data()); - // get device op instances - const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); - - std::cout << "found " << op_ptrs.size() << " instances" << std::endl; - std::string best_op_name; float best_ave_time = 0; float best_tflops = 0; diff --git a/profiler/include/profiler/profile_batchnorm_backward_impl.hpp b/profiler/include/profiler/profile_batchnorm_backward_impl.hpp index 2b0e8c3806..9d56dae54f 100644 --- a/profiler/include/profiler/profile_batchnorm_backward_impl.hpp +++ b/profiler/include/profiler/profile_batchnorm_backward_impl.hpp @@ -63,6 +63,27 @@ bool profile_batchnorm_backward_impl(bool do_verification, }; } + using PassThroughOp = ck::tensor_operation::element_wise::PassThrough; + + // add device batchnorm-backward instances + using DeviceOp = ck::tensor_operation::device::DeviceBatchNormBwd; + + // get device op instances + const auto instance_ptrs = + ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << instance_ptrs.size() << " instances" << std::endl; + // input data of the batchnorm backward algorithm Tensor x(inOutLengths); Tensor dy(inOutLengths); @@ -191,27 +212,6 @@ bool profile_batchnorm_backward_impl(bool do_verification, std::copy(reduceDims.begin(), reduceDims.end(), arrReduceDims.begin()); - using PassThroughOp = ck::tensor_operation::element_wise::PassThrough; - - // add device batchnorm-backward instances - using DeviceOp = ck::tensor_operation::device::DeviceBatchNormBwd; - - // get device op instances - const auto instance_ptrs = - ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); - - std::cout << "found " << instance_ptrs.size() << " instances" << std::endl; - std::string best_instance_name; float best_avg_time = std::numeric_limits::max(); float best_gb_per_sec = 0; diff --git a/profiler/include/profiler/profile_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_conv_bwd_data_impl.hpp index bf5ffcb5d2..937fb24f5a 100644 --- a/profiler/include/profiler/profile_conv_bwd_data_impl.hpp +++ b/profiler/include/profiler/profile_conv_bwd_data_impl.hpp @@ -111,6 +111,23 @@ bool profile_conv_bwd_data_impl(int do_verification, std::cout << "weight: " << weight.mDesc << std::endl; std::cout << "output: " << output.mDesc << std::endl; + using DeviceOp = ck::tensor_operation::device::DeviceConvBwdData; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + switch(init_method) { case 0: break; @@ -179,23 +196,6 @@ bool profile_conv_bwd_data_impl(int do_verification, gpu_ref_in_dev.FromDevice(gpu_ref_input.mData.data()); } - using DeviceOp = ck::tensor_operation::device::DeviceConvBwdData; - - // get device op instances - const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); - - std::cout << "found " << op_ptrs.size() << " instances" << std::endl; - std::string best_op_name; float best_avg_time = 0; float best_tflops = 0; diff --git a/profiler/include/profiler/profile_conv_fwd_impl.hpp b/profiler/include/profiler/profile_conv_fwd_impl.hpp index 0dc178ef39..44ce422140 100644 --- a/profiler/include/profiler/profile_conv_fwd_impl.hpp +++ b/profiler/include/profiler/profile_conv_fwd_impl.hpp @@ -89,6 +89,23 @@ bool profile_conv_fwd_impl(int do_verification, std::cout << "weight: " << weight.mDesc << std::endl; std::cout << "output: " << host_output.mDesc << std::endl; + using DeviceOp = ck::tensor_operation::device::DeviceConvFwd; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + switch(init_method) { case 0: break; @@ -158,23 +175,6 @@ bool profile_conv_fwd_impl(int do_verification, gpu_ref_out_dev.FromDevice(gpu_ref_output.mData.data()); } - using DeviceOp = ck::tensor_operation::device::DeviceConvFwd; - - // get device op instances - const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); - - std::cout << "found " << op_ptrs.size() << " instances" << std::endl; - std::string best_op_name; float best_avg_time = 0; float best_tflops = 0; diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp index aff47e282e..8a5bf966b7 100644 --- a/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp @@ -115,6 +115,28 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, std::cout << "wei: " << wei_g_k_c_xs_desc << std::endl; std::cout << "in: " << in_g_n_c_wis_desc << std::endl; + using DeviceOp = + ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD, + InLayout, + OutDataType, + WeiDataType, + ck::Tuple<>, + InDataType, + OutElementOp, + WeiElementOp, + InElementOp, + ComputeDataType, + ComputeDataType>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + // Create host tensors Tensor out(out_g_n_k_wos_desc); Tensor wei(wei_g_k_c_xs_desc); @@ -438,26 +460,6 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, }; // do GEMM - using DeviceOp = - ck::tensor_operation::device::DeviceGroupedConvBwdDataMultipleD, - InLayout, - OutDataType, - WeiDataType, - ck::Tuple<>, - InDataType, - OutElementOp, - WeiElementOp, - InElementOp, - ComputeDataType, - ComputeDataType>; - - // get device op instances - const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); - std::array out_lengths{}; std::array out_strides{}; std::array wei_lengths{}; diff --git a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp index 9d57f38790..9b7a68224f 100644 --- a/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_bwd_weight_impl.hpp @@ -118,6 +118,25 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, std::cout << "weight: " << wei_g_k_c_xs_desc << std::endl; std::cout << "output: " << out_g_n_k_wos_desc << std::endl; + using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdWeight; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + // Create host tensors Tensor input(in_g_n_c_wis_desc); Tensor weight_host_result(wei_g_k_c_xs_desc); @@ -244,25 +263,6 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, } } - using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdWeight; - - // get device op instances - const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); - - std::cout << "found " << op_ptrs.size() << " instances" << std::endl; - std::string best_op_name; float best_avg_time = 0; float best_tflops = 0; diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_bias_clamp_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_bias_clamp_impl.hpp index 2a282edbc8..c46ca57313 100644 --- a/profiler/include/profiler/profile_grouped_conv_fwd_bias_clamp_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_fwd_bias_clamp_impl.hpp @@ -129,6 +129,28 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification, std::cout << "output: " << host_output.mDesc << std::endl; std::cout << "bias: " << bias.mDesc << std::endl; + using DeviceOp = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD, + OutLayout, + InDataType, + WeiDataType, + ck::Tuple, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + AComputeType, + BComputeType>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "ckProfiler found " << op_ptrs.size() << " instances" << std::endl; + switch(init_method) { case 0: break; @@ -339,28 +361,6 @@ bool profile_grouped_conv_fwd_bias_clamp_impl(int do_verification, } }; - using DeviceOp = - ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD, - OutLayout, - InDataType, - WeiDataType, - ck::Tuple, - OutDataType, - InElementOp, - WeiElementOp, - OutElementOp, - AComputeType, - BComputeType>; - - // get device op instances - const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); - - std::cout << "ckProfiler found " << op_ptrs.size() << " instances" << std::endl; - if constexpr(BiasGK) { constexpr ck::index_t spatial_offset = 3; diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_bilinear_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_bilinear_impl.hpp index b439428cda..4198980fab 100644 --- a/profiler/include/profiler/profile_grouped_conv_fwd_bilinear_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_fwd_bilinear_impl.hpp @@ -107,6 +107,27 @@ bool profile_grouped_conv_fwd_bilinear_impl( std::cout << "d_tensor: " << d_tensor.mDesc << std::endl; std::cout << "output: " << host_output.mDesc << std::endl; + using DeviceOp = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD, + OutLayout, + InDataType, + WeiDataType, + ck::Tuple, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + AComputeType, + BComputeType>; + + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + switch(init_method) { case 0: break; @@ -231,27 +252,6 @@ bool profile_grouped_conv_fwd_bilinear_impl( float best_gb_per_sec = 0; int valids = 0; - using DeviceOp = - ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD, - OutLayout, - InDataType, - WeiDataType, - ck::Tuple, - OutDataType, - InElementOp, - WeiElementOp, - OutElementOp, - AComputeType, - BComputeType>; - - const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); - - std::cout << "found " << op_ptrs.size() << " instances" << std::endl; - for(std::size_t i = 0; i < op_ptrs.size(); ++i) { auto& op_ptr = op_ptrs[i]; diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp index bf8da55f3c..44d000f8c6 100644 --- a/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_fwd_impl.hpp @@ -145,6 +145,27 @@ bool profile_grouped_conv_fwd_impl(int do_verification, std::cout << "weight: " << wei_g_k_c_xs_desc << std::endl; std::cout << "output: " << out_g_n_k_wos_desc << std::endl; + using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD, + OutLayout, + InDataType, + WeiDataType, + ck::Tuple<>, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + AComputeType, + BComputeType>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "ckProfiler found " << op_ptrs.size() << " instances" << std::endl; + // Create host tensors Tensor input(in_g_n_c_wis_desc); Tensor weight(wei_g_k_c_xs_desc); @@ -411,27 +432,6 @@ bool profile_grouped_conv_fwd_impl(int do_verification, } }; - using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD, - OutLayout, - InDataType, - WeiDataType, - ck::Tuple<>, - OutDataType, - InElementOp, - WeiElementOp, - OutElementOp, - AComputeType, - BComputeType>; - - // get device op instances - const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); - - std::cout << "ckProfiler found " << op_ptrs.size() << " instances" << std::endl; - if(list_instances) { std::cout << "\nValid instances for this problem:" << std::endl; diff --git a/profiler/include/profiler/profile_grouped_conv_fwd_outelementop_impl.hpp b/profiler/include/profiler/profile_grouped_conv_fwd_outelementop_impl.hpp index ff93105e29..952f01a9ed 100644 --- a/profiler/include/profiler/profile_grouped_conv_fwd_outelementop_impl.hpp +++ b/profiler/include/profiler/profile_grouped_conv_fwd_outelementop_impl.hpp @@ -151,6 +151,27 @@ bool profile_grouped_conv_fwd_outelementop_impl(int do_verification, std::cout << "scale_wei: " << scale_wei << std::endl; std::cout << "scale_out: " << scale_out << std::endl; + using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD, + OutLayout, + InDataType, + WeiDataType, + ck::Tuple<>, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + AComputeType, + BComputeType>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "ckProfiler found " << op_ptrs.size() << " instances" << std::endl; + // run reference op if(do_verification == 1) { @@ -340,27 +361,6 @@ bool profile_grouped_conv_fwd_outelementop_impl(int do_verification, } }; - using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD, - OutLayout, - InDataType, - WeiDataType, - ck::Tuple<>, - OutDataType, - InElementOp, - WeiElementOp, - OutElementOp, - AComputeType, - BComputeType>; - - // get device op instances - const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); - - std::cout << "ckProfiler found " << op_ptrs.size() << " instances" << std::endl; - for(auto& op_ptr : op_ptrs) { auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(), diff --git a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_bilinear.cpp b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_bilinear.cpp index f1a6cd843f..0a6d8472cd 100644 --- a/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_bilinear.cpp +++ b/test/grouped_convnd_bwd_weight/test_grouped_convnd_bwd_weight_bilinear.cpp @@ -140,6 +140,25 @@ class TestGroupedConvndBwdWeight : public ::testing::Test std::cout << "wei: " << wei_host.mDesc << std::endl; std::cout << "out: " << out.mDesc << std::endl; + using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdWeightMultipleD< + NDimSpatial, + InLayout, + WeiLayout, + OutLayout, + ck::Tuple, + InDataType, + WeiDataType, + OutDataType, + ck::Tuple, + InElementOp, + WeiElementOp, + OutElementOp>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}); out.GenerateTensorValue(GeneratorTensor_2{-5, 5}); d.GenerateTensorValue(GeneratorTensor_2{-5, 5}); @@ -179,23 +198,6 @@ class TestGroupedConvndBwdWeight : public ::testing::Test RunReference(conv_param, in, wei_host, out, d); - using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvBwdWeightMultipleD< - NDimSpatial, - InLayout, - WeiLayout, - OutLayout, - ck::Tuple, - InDataType, - WeiDataType, - OutDataType, - ck::Tuple, - InElementOp, - WeiElementOp, - OutElementOp>; - - // get device op instances - const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); int num_kernel = 0; for(std::size_t i = 0; i < op_ptrs.size(); ++i) diff --git a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_scaleadd_ab.cpp b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_scaleadd_ab.cpp index e78e61f707..46c111e2b5 100644 --- a/test/grouped_convnd_fwd/test_grouped_convnd_fwd_scaleadd_ab.cpp +++ b/test/grouped_convnd_fwd/test_grouped_convnd_fwd_scaleadd_ab.cpp @@ -104,6 +104,27 @@ bool profile_grouped_conv_fwd_scaleadd_ab_impl(int do_verification, std::cout << "weight: " << weight.mDesc << std::endl; std::cout << "output: " << host_output.mDesc << std::endl; + // InDataType and WeiDataType must be tuple, inLayout and weiLayout are single. + using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD< + NDimSpatial, + InLayout, + WeiLayout, + ck::Tuple<>, + OutLayout, + ck::Tuple, + ck::Tuple, + ck::Tuple<>, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "ckProfiler found " << op_ptrs.size() << " instances" << std::endl; + switch(init_method) { case 0: break; @@ -246,27 +267,6 @@ bool profile_grouped_conv_fwd_scaleadd_ab_impl(int do_verification, } }; - // InDataType and WeiDataType must be tuple, inLayout and weiLayout are single. - using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD< - NDimSpatial, - InLayout, - WeiLayout, - ck::Tuple<>, - OutLayout, - ck::Tuple, - ck::Tuple, - ck::Tuple<>, - OutDataType, - InElementOp, - WeiElementOp, - OutElementOp>; - - // get device op instances - const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< - DeviceOp>::GetInstances(); - - std::cout << "ckProfiler found " << op_ptrs.size() << " instances" << std::endl; - std::array as{in_device_buf.GetDeviceBuffer(), in_bias_device_buf.GetDeviceBuffer()}; std::array bs{wei_device_buf.GetDeviceBuffer(),