diff --git a/profiler/include/profile_conv_bwd_data_impl.hpp b/profiler/include/profile_conv_bwd_data_impl.hpp deleted file mode 100644 index dfec033737..0000000000 --- a/profiler/include/profile_conv_bwd_data_impl.hpp +++ /dev/null @@ -1,284 +0,0 @@ -#pragma once - -#include "check_err.hpp" -#include "config.hpp" -#include "device.hpp" -#include "host_tensor.hpp" -#include "host_tensor_generator.hpp" -#include "tensor_layout.hpp" -#include "device_tensor.hpp" -#include "device_conv_bwd_data.hpp" -#include "element_wise_operation.hpp" -#include "reference_conv_bwd_data.hpp" - -using F16 = ck::half_t; -using F32 = float; -using BF16 = ck::bhalf_t; -using INT8 = int8_t; -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_conv2d_bwd_data_instance { - -using DeviceConvBwdDataNoOpPtr = - DeviceConvBwdDataPtr; -void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances( - std::vector&); -void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances( - std::vector&); -void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances( - std::vector&); -void add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances( - std::vector&); -} // namespace device_conv2d_bwd_data_instance -} // namespace device -} // namespace tensor_operation -} // namespace ck - -namespace ck { -namespace profiler { - -template -void profile_conv_bwd_data_impl(int do_verification, - int init_method, - bool do_log, - bool time_kernel, - ck::index_t N, - ck::index_t K, - ck::index_t C, - std::vector input_spatial_lengths, - std::vector filter_spatial_lengths, - std::vector output_spatial_lengths, - std::vector conv_filter_strides, - std::vector conv_filter_dilations, - std::vector input_left_pads, - std::vector input_right_pads) -{ - const ck::index_t Y = filter_spatial_lengths[0]; - const ck::index_t X = filter_spatial_lengths[1]; - - const ck::index_t Hi = input_spatial_lengths[0]; - const ck::index_t Wi = input_spatial_lengths[1]; - - const ck::index_t Ho = output_spatial_lengths[0]; - const ck::index_t Wo = output_spatial_lengths[1]; - - auto f_host_tensor_descriptor = - [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) { - if constexpr(is_same::value || - is_same::value || - is_same::value) - { - return HostTensorDescriptor(std::vector({N_, C_, H, W}), - std::vector({C_ * H * W, H * W, W, 1})); - } - else if constexpr(is_same::value || - is_same::value || - is_same::value) - { - return HostTensorDescriptor(std::vector({N_, C_, H, W}), - std::vector({C_ * H * W, 1, W * C_, C_})); - } - }; - - Tensor in_n_c_hi_wi_host_result(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{})); - Tensor in_n_c_hi_wi_device_result( - f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{})); - Tensor wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{})); - Tensor out_n_k_ho_wo(f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{})); - - std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi_host_result.mDesc << std::endl; - std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl; - std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo.mDesc << std::endl; - - switch(init_method) - { - case 0: break; - case 1: - out_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - break; - default: - out_n_k_ho_wo.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); - wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - } - - using InElementOp = ck::tensor_operation::element_wise::PassThrough; - using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; - using OutElementOp = ck::tensor_operation::element_wise::PassThrough; - - const auto in_element_op = InElementOp{}; - const auto wei_element_op = WeiElementOp{}; - const auto out_element_op = OutElementOp{}; - - if(do_verification) - { - using ReferenceConvBwdDataInstance = - ck::tensor_operation::host::ReferenceConvBwdData; - - auto ref_conv = ReferenceConvBwdDataInstance{}; - auto ref_invoker = ref_conv.MakeInvoker(); - auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi_host_result, - wei_k_c_y_x, - out_n_k_ho_wo, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads, - in_element_op, - wei_element_op, - out_element_op); - - ref_invoker.Run(ref_argument); - } - - DeviceMem in_device_buf(sizeof(InDataType) * - in_n_c_hi_wi_device_result.mDesc.GetElementSpace()); - DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace()); - DeviceMem out_device_buf(sizeof(OutDataType) * out_n_k_ho_wo.mDesc.GetElementSpace()); - - out_device_buf.ToDevice(out_n_k_ho_wo.mData.data()); - wei_device_buf.ToDevice(wei_k_c_y_x.mData.data()); - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - - using DeviceConvBwdDataNoOpPtr = - ck::tensor_operation::device::DeviceConvBwdDataPtr; - - // add device Conv instances - std::vector conv_ptrs; - if constexpr(ck::is_same_v, float> && - ck::is_same_v, float> && - ck::is_same_v, float>) - { - ck::tensor_operation::device::device_conv2d_bwd_data_instance:: - add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs); - } - else if constexpr(ck::is_same_v, ck::half_t> && - ck::is_same_v, ck::half_t> && - ck::is_same_v, ck::half_t>) - { - ck::tensor_operation::device::device_conv2d_bwd_data_instance:: - add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs); - } - else if constexpr(ck::is_same_v, ck::bhalf_t> && - ck::is_same_v, ck::bhalf_t> && - ck::is_same_v, ck::bhalf_t>) - { - ck::tensor_operation::device::device_conv2d_bwd_data_instance:: - add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs); - } - else if constexpr(ck::is_same_v, int8_t> && - ck::is_same_v, int8_t> && - ck::is_same_v, int8_t>) - { - ck::tensor_operation::device::device_conv2d_bwd_data_instance:: - add_device_conv2d_bwd_data_xdl_nhwc_kyxc_nhwk_int8_instances(conv_ptrs); - } - - if(conv_ptrs.size() <= 0) - { - throw std::runtime_error("wrong! no device Conv instance found"); - } - - std::string best_conv_name; - float best_ave_time = 0; - float best_tflops = 0; - float best_gb_per_sec = 0; - - // profile device Conv instances - for(auto& conv_ptr : conv_ptrs) - { - auto argument_ptr = conv_ptr->MakeArgumentPointer( - static_cast(in_device_buf.GetDeviceBuffer()), - static_cast(wei_device_buf.GetDeviceBuffer()), - static_cast(out_device_buf.GetDeviceBuffer()), - N, - K, - C, - input_spatial_lengths, - filter_spatial_lengths, - output_spatial_lengths, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads, - in_element_op, - wei_element_op, - out_element_op); - - auto invoker_ptr = conv_ptr->MakeInvokerPointer(); - - if(conv_ptr->IsSupportedArgument(argument_ptr.get())) - { - std::string conv_name = conv_ptr->GetTypeString(); - - float ave_time = - invoker_ptr->Run(argument_ptr.get(), StreamControl{nullptr, time_kernel}); - - std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; - - std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) + - sizeof(WeiDataType) * (K * C * Y * X) + - sizeof(OutDataType) * (N * K * Ho * Wo); - - float tflops = static_cast(flop) / 1.E9 / ave_time; - - float gb_per_sec = num_btype / 1.E6 / ave_time; - - std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec - << " GB/s, " << conv_name << std::endl; - - if(tflops > best_tflops) - { - best_conv_name = conv_name; - best_tflops = tflops; - best_ave_time = ave_time; - best_gb_per_sec = gb_per_sec; - } - - if(do_verification) - { - in_device_buf.FromDevice(in_n_c_hi_wi_device_result.mData.data()); - - ck::utils::check_err(in_n_c_hi_wi_device_result.mData, - in_n_c_hi_wi_host_result.mData); - - if(do_log) - { - LogRangeAsType(std::cout << "in : ", out_n_k_ho_wo.mData, ",") - << std::endl; - LogRangeAsType(std::cout << "wei: ", wei_k_c_y_x.mData, ",") - << std::endl; - LogRangeAsType( - std::cout << "out_host : ", in_n_c_hi_wi_host_result.mData, ",") - << std::endl; - LogRangeAsType( - std::cout << "out_device: ", in_n_c_hi_wi_device_result.mData, ",") - << std::endl; - } - } - } - } - - std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " - << best_gb_per_sec << " GB/s, " << best_conv_name << std::endl; -} - -} // namespace profiler -} // namespace ck diff --git a/profiler/src/profile_conv_bwd_data.cpp b/profiler/src/profile_conv_bwd_data.cpp deleted file mode 100644 index 206d486ea0..0000000000 --- a/profiler/src/profile_conv_bwd_data.cpp +++ /dev/null @@ -1,195 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "profile_conv_bwd_data_impl.hpp" - -enum struct ConvDataType -{ - F32_F32_F32, // 0 - F16_F16_F16, // 1 - BF16_BF16_BF16, // 2 - INT8_INT8_INT8, // 3 -}; - -enum struct ConvInputLayout -{ - NCHW, // 0 - NHWC, // 1 -}; - -enum struct ConvWeightLayout -{ - KCYX, // 0 - KYXC, // 1 -}; - -enum struct ConvOutputLayout -{ - NKHW, // 0 - NHWK, // 1 -}; - -int profile_conv_bwd_data(int argc, char* argv[]) -{ - if(argc != 25) - { - printf("arg1: tensor operation (conv_bwd: BackwardConvolution)\n"); - printf("arg2: data type (0: fp32; 1: fp16)\n"); - printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n"); - printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n"); - printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n"); - printf("arg6: verification (0: no; 1: yes)\n"); - printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n"); - printf("arg8: print tensor value (0: no; 1: yes)\n"); - printf("arg9: time kernel (0=n0, 1=yes)\n"); - printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " - "RightPx\n"); - exit(1); - } - - const auto data_type = static_cast(std::stoi(argv[2])); - const auto in_layout = static_cast(std::stoi(argv[3])); - const auto wei_layout = static_cast(std::stoi(argv[4])); - const auto out_layout = static_cast(std::stoi(argv[5])); - const bool do_verification = std::stoi(argv[6]); - const int init_method = std::stoi(argv[7]); - const bool do_log = std::stoi(argv[8]); - const bool time_kernel = std::stoi(argv[9]); - - const ck::index_t N = std::stoi(argv[10]); - const ck::index_t K = std::stoi(argv[11]); - const ck::index_t C = std::stoi(argv[12]); - const ck::index_t Y = std::stoi(argv[13]); - const ck::index_t X = std::stoi(argv[14]); - const ck::index_t Hi = std::stoi(argv[15]); - const ck::index_t Wi = std::stoi(argv[16]); - - const ck::index_t conv_stride_h = std::stoi(argv[17]); - const ck::index_t conv_stride_w = std::stoi(argv[18]); - const ck::index_t conv_dilation_h = std::stoi(argv[19]); - const ck::index_t conv_dilation_w = std::stoi(argv[20]); - const ck::index_t in_left_pad_h = std::stoi(argv[21]); - const ck::index_t in_left_pad_w = std::stoi(argv[22]); - const ck::index_t in_right_pad_h = std::stoi(argv[23]); - const ck::index_t in_right_pad_w = std::stoi(argv[24]); - - const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1; - const ck::index_t XEff = (X - 1) * conv_dilation_w + 1; - - const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1; - const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; - - if(data_type == ConvDataType::F32_F32_F32 && in_layout == ConvInputLayout::NHWC && - wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK) - { - ck::profiler::profile_conv_bwd_data_impl<2, - float, - float, - float, - float, - ck::tensor_layout::convolution::NHWC, - ck::tensor_layout::convolution::KYXC, - ck::tensor_layout::convolution::NHWK>( - do_verification, - init_method, - do_log, - StreamControl{nullptr, time_kernel}, - N, - K, - C, - std::vector{Hi, Wi}, - std::vector{Y, X}, - std::vector{Ho, Wo}, - std::vector{conv_stride_h, conv_stride_w}, - std::vector{conv_dilation_h, conv_dilation_w}, - std::vector{in_left_pad_h, in_left_pad_w}, - std::vector{in_right_pad_h, in_right_pad_w}); - } - else if(data_type == ConvDataType::F16_F16_F16 && in_layout == ConvInputLayout::NHWC && - wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK) - { - ck::profiler::profile_conv_bwd_data_impl<2, - ck::half_t, - ck::half_t, - ck::half_t, - float, - ck::tensor_layout::convolution::NHWC, - ck::tensor_layout::convolution::KYXC, - ck::tensor_layout::convolution::NHWK>( - do_verification, - init_method, - do_log, - StreamControl{nullptr, time_kernel}, - N, - K, - C, - std::vector{Hi, Wi}, - std::vector{Y, X}, - std::vector{Ho, Wo}, - std::vector{conv_stride_h, conv_stride_w}, - std::vector{conv_dilation_h, conv_dilation_w}, - std::vector{in_left_pad_h, in_left_pad_w}, - std::vector{in_right_pad_h, in_right_pad_w}); - } - else if(data_type == ConvDataType::BF16_BF16_BF16 && in_layout == ConvInputLayout::NHWC && - wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK) - { - ck::profiler::profile_conv_bwd_data_impl<2, - uint16_t, - uint16_t, - uint16_t, - float, - ck::tensor_layout::convolution::NHWC, - ck::tensor_layout::convolution::KYXC, - ck::tensor_layout::convolution::NHWK>( - do_verification, - init_method, - do_log, - StreamControl{nullptr, time_kernel}, - N, - K, - C, - std::vector{Hi, Wi}, - std::vector{Y, X}, - std::vector{Ho, Wo}, - std::vector{conv_stride_h, conv_stride_w}, - std::vector{conv_dilation_h, conv_dilation_w}, - std::vector{in_left_pad_h, in_left_pad_w}, - std::vector{in_right_pad_h, in_right_pad_w}); - } - else if(data_type == ConvDataType::INT8_INT8_INT8 && in_layout == ConvInputLayout::NHWC && - wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK) - { - ck::profiler::profile_conv_bwd_data_impl<2, - int8_t, - int8_t, - int8_t, - int32_t, - ck::tensor_layout::convolution::NHWC, - ck::tensor_layout::convolution::KYXC, - ck::tensor_layout::convolution::NHWK>( - do_verification, - init_method, - do_log, - StreamControl{nullptr, time_kernel}, - N, - K, - C, - std::vector{Hi, Wi}, - std::vector{Y, X}, - std::vector{Ho, Wo}, - std::vector{conv_stride_h, conv_stride_w}, - std::vector{conv_dilation_h, conv_dilation_w}, - std::vector{in_left_pad_h, in_left_pad_w}, - std::vector{in_right_pad_h, in_right_pad_w}); - } - else - { - throw std::runtime_error("wrong! this Conv data_type & layout is not implemented"); - } - - return 1; -}