From f61888f6746354d0bda04f2d23c187eb5e3a7778 Mon Sep 17 00:00:00 2001 From: Adam Osewski <19374865+aosewski@users.noreply.github.com> Date: Fri, 22 Apr 2022 00:39:39 +0200 Subject: [PATCH] Convolution FWD profiler refactor. (#183) * Convolution ND * Code unification across dimensions for generating tensor descriptors. * Example * Instances * Move convnd f32 instance file to comply with repo structure. * Conv 1D tensor layouts. * Formatting and use ReferenceConv * Reference ConvFwd supporting 1D and 2D convolution. * Debug printing TensorLayout name. * Conv fwd 1D instance f32 * Refactor conv ND example. Needed to support various conv dimensions. Needed to support various conv dimensions * Rename conv nd example directory to prevent conflicts. * Refactor some common utility to single file. Plus some tests. * Refactor GetHostTensorDescriptor + UT. * Add 1D test case. * Test reference convolution 1d/2d * Remove some leftovers. * Fix convolution example error for 1D * Refactor test check errors utility function. * Test Conv2D Fwd XDL * More UT for 1D case. * Parameterize input & weight initializers. * Rename example to prevent conflicts. * Split convnd instance into separate files for 1d/2d * Address review comments. * Fix data type for flops/gbytes calculations. * Assign example number 11. * 3D cases for convolution utility functions. * 3D reference convolution. * Add support for 3D convolution. * Check for inputs bigger than 2GB. * Formatting * Support for bf16/f16/f32/i8 - conv instances + UT. * Use check_err from test_util.hpp. * Split convnd test into separate files for each dim. * Fix data generation and use proper instances. * Formatting * Skip tensor initialization if not necessary. * Fix CMakefiles. * Remove redundant conv2d_fwd test. * Lower problem size for conv3D UT. * 3D case for convnd example. * Remove leftovers after merge. * Add Conv Specialization string to GetTypeString * Skip instance causing numerical errors. * Small fixes. 
* Remove redundant includes. * Fix namespace name error. * Script for automatic testing and logging convolution fwd UTs * Comment out numactl cmd. * Refine weights initialization and relax rtol for fp16 * Move test_util.hpp to check_err.hpp * Refine weights initialization and relax rtol for fp16 * Refactor common part of test conv utils. * Move utility function to single common place. * Add additional common functions to utility. * Refactor convnd_fwd_xdl examples. * Remove redundant files. * Unify structure. * Add constructor to ConvParams. * And add input parameters validation. * Modify conv examples to use single utility file. * Remove check_error from host_tensor.hpp * Get rid of check_indices function. * Remove bf16_to_f32 function overload for scalars. * Fix namespace. * Add half_float::half for check_err. * Fix conv params size in UT. * Fix weights initialization for int8. * Fix weights initialization for int8. * Add type_convert when store output in ref conv 1D. * Get back old conv2d_fwd_xdl operation. * Silence conv debug print. * format * clean * clean * Fix merge. * Fix namespace for check_err * Formatting. * Fix merge artifacts. * Remove deleted header. * Fix some includes and use ck::utils::check_err. * Remove unused check_indices restored by previous merge. * Fix namespaces after merge. * Fix compilation error. * Small fixes. * Use common functions. * Fix filename * Fix namespaces. * Fix merge artifact - retrieve removed by accident fun. * Fix ConvForwardSpecialization. * Working example of OpInstanceRunEngine for conv2dfwd UT. * Adhere to coding style rules. * Formatting and adhere to coding style rules. * Fix merge artifacts. * Utility for collecting conv fwd instances. + Plus common part for parsing cmdline params. * Refactor FillUniform because of segfault for int8_t. * Naming convention. * Elegant version of device mem allocation. * Use OpInstanceRunEngine in conv fwd nd tests. * Multiple refinements. 
* conditional init * don't run reference op if not provided. * Use OpInstanceRunEngine for ckProfiler conv_fwd * Refactor common tensor fill function to separate file. * Clean up unused functions. * Support different init methods. * Create CMake target for conv_fwd_util. * Add header for profile_convnd_fwd.cpp * Fix CMakefiles to link with conv_fwd_util where needed. * Fix some clutter. Co-authored-by: Adam Osewski Co-authored-by: Chao Liu [ROCm/composable_kernel commit: 1a0cd5d160dfbe107a454f975a26599fc6daddd4] --- .../06_conv2d_fwd_bias_relu/CMakeLists.txt | 1 + .../CMakeLists.txt | 1 + example/09_convnd_fwd/CMakeLists.txt | 3 + example/10_conv2d_bwd_data/CMakeLists.txt | 1 + example/11_conv2d_bwd_weight/CMakeLists.txt | 1 + example/17_convnd_bwd_data_xdl/CMakeLists.txt | 1 + library/CMakeLists.txt | 1 + .../ck/library/utility/conv_fwd_util.hpp | 647 +++++++++--------- library/include/ck/library/utility/fill.hpp | 81 +++ .../ck/library/utility/op_instance_engine.hpp | 231 +++++++ library/src/utility/CMakeLists.txt | 21 + library/src/utility/conv_fwd_util.cpp | 238 +++++++ profiler/CMakeLists.txt | 6 +- profiler/include/profile_conv_fwd_impl.hpp | 283 -------- profiler/include/profile_convnd_fwd.hpp | 9 + profiler/src/profile_conv_fwd.cpp | 191 ------ profiler/src/profile_convnd_bwd_data.cpp | 4 + profiler/src/profile_convnd_fwd.cpp | 351 ++++++++++ profiler/src/profiler.cpp | 5 +- test/conv2d_bwd_weight/CMakeLists.txt | 3 +- test/conv_util/CMakeLists.txt | 2 +- test/convnd_bwd_data/CMakeLists.txt | 3 +- test/convnd_fwd/CMakeLists.txt | 10 +- test/convnd_fwd/conv1d_fwd.cpp | 110 +-- test/convnd_fwd/conv2d_fwd.cpp | 104 +-- test/convnd_fwd/conv3d_fwd.cpp | 263 +++---- test/convnd_fwd/conv_util.hpp | 24 +- test/reference_conv_fwd/CMakeLists.txt | 2 +- .../reference_conv_fwd/reference_conv_fwd.cpp | 41 +- 29 files changed, 1473 insertions(+), 1165 deletions(-) create mode 100644 library/include/ck/library/utility/fill.hpp create mode 100644 
library/include/ck/library/utility/op_instance_engine.hpp create mode 100644 library/src/utility/CMakeLists.txt create mode 100644 library/src/utility/conv_fwd_util.cpp delete mode 100644 profiler/include/profile_conv_fwd_impl.hpp create mode 100644 profiler/include/profile_convnd_fwd.hpp delete mode 100644 profiler/src/profile_conv_fwd.cpp create mode 100644 profiler/src/profile_convnd_fwd.cpp diff --git a/example/06_conv2d_fwd_bias_relu/CMakeLists.txt b/example/06_conv2d_fwd_bias_relu/CMakeLists.txt index d7d7a3f75e..df8f70606c 100644 --- a/example/06_conv2d_fwd_bias_relu/CMakeLists.txt +++ b/example/06_conv2d_fwd_bias_relu/CMakeLists.txt @@ -1 +1,2 @@ add_example_executable(example_conv2d_fwd_xdl_bias_relu conv2d_fwd_xdl_bias_relu.cpp) +target_link_libraries(example_conv2d_fwd_xdl_bias_relu PRIVATE conv_fwd_util) diff --git a/example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt b/example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt index 9dec34cf9a..8bc5980025 100644 --- a/example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt +++ b/example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt @@ -1 +1,2 @@ add_example_executable(example_conv2d_fwd_xdl_bias_relu_add conv2d_fwd_xdl_bias_relu_add.cpp) +target_link_libraries(example_conv2d_fwd_xdl_bias_relu_add PRIVATE conv_fwd_util) diff --git a/example/09_convnd_fwd/CMakeLists.txt b/example/09_convnd_fwd/CMakeLists.txt index fd6d11d9ff..f602862a04 100644 --- a/example/09_convnd_fwd/CMakeLists.txt +++ b/example/09_convnd_fwd/CMakeLists.txt @@ -1,3 +1,6 @@ add_example_executable(example_convnd_fwd_xdl convnd_fwd_xdl.cpp) +target_link_libraries(example_convnd_fwd_xdl PRIVATE conv_fwd_util) add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp) +target_link_libraries(example_convnd_fwd_xdl_int8 PRIVATE conv_fwd_util) add_example_executable(example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp) +target_link_libraries(example_convnd_fwd_xdl_fp16 PRIVATE conv_fwd_util) diff --git 
a/example/10_conv2d_bwd_data/CMakeLists.txt b/example/10_conv2d_bwd_data/CMakeLists.txt index 6ff4c9bb16..f300bc9645 100644 --- a/example/10_conv2d_bwd_data/CMakeLists.txt +++ b/example/10_conv2d_bwd_data/CMakeLists.txt @@ -1 +1,2 @@ add_example_executable(example_conv2d_bwd_data_xdl conv2d_bwd_data_xdl.cpp) +target_link_libraries(example_conv2d_bwd_data_xdl PRIVATE conv_fwd_util) diff --git a/example/11_conv2d_bwd_weight/CMakeLists.txt b/example/11_conv2d_bwd_weight/CMakeLists.txt index bbedb57645..ff001eab72 100644 --- a/example/11_conv2d_bwd_weight/CMakeLists.txt +++ b/example/11_conv2d_bwd_weight/CMakeLists.txt @@ -1 +1,2 @@ add_example_executable(example_conv2d_bwd_weight_xdl conv2d_bwd_weight_xdl.cpp) +target_link_libraries(example_conv2d_bwd_weight_xdl PRIVATE conv_fwd_util) diff --git a/example/17_convnd_bwd_data_xdl/CMakeLists.txt b/example/17_convnd_bwd_data_xdl/CMakeLists.txt index 875203b264..0ed906f8f7 100644 --- a/example/17_convnd_bwd_data_xdl/CMakeLists.txt +++ b/example/17_convnd_bwd_data_xdl/CMakeLists.txt @@ -1 +1,2 @@ add_example_executable(example_convnd_bwd_data_xdl convnd_bwd_data_xdl.cpp) +target_link_libraries(example_convnd_bwd_data_xdl PRIVATE conv_fwd_util) diff --git a/library/CMakeLists.txt b/library/CMakeLists.txt index 7b5523d23b..aa18026932 100644 --- a/library/CMakeLists.txt +++ b/library/CMakeLists.txt @@ -1,2 +1,3 @@ add_subdirectory(src/host_tensor) add_subdirectory(src/tensor_operation_instance/gpu) +add_subdirectory(src/utility) diff --git a/library/include/ck/library/utility/conv_fwd_util.hpp b/library/include/ck/library/utility/conv_fwd_util.hpp index f758b808c3..a29eb814fd 100644 --- a/library/include/ck/library/utility/conv_fwd_util.hpp +++ b/library/include/ck/library/utility/conv_fwd_util.hpp @@ -1,13 +1,10 @@ -#ifndef CONV_FWD_UTIL_HPP -#define CONV_FWD_UTIL_HPP +#pragma once -#include #include #include #include #include #include -#include #include #include #include @@ -18,10 +15,50 @@ #include "device_conv_fwd.hpp" 
#include "device_tensor.hpp" #include "element_wise_operation.hpp" +#include "fill.hpp" #include "host_tensor.hpp" +#include "op_instance_engine.hpp" #include "reference_conv_fwd.hpp" #include "tensor_layout.hpp" +namespace ck { +namespace tensor_operation { +namespace device { + +using DeviceConvFwdNoOpPtr = DeviceConvFwdPtr; +namespace device_conv1d_fwd_instance { + +void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instances(std::vector&); +void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instances(std::vector&); +void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instances(std::vector&); +void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances(std::vector&); + +} // namespace device_conv1d_fwd_instance +namespace device_conv2d_fwd_instance { + +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(std::vector&); +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(std::vector&); +void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances( + std::vector&); +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(std::vector&); +void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(std::vector&); + +} // namespace device_conv2d_fwd_instance +namespace device_conv3d_fwd_instance { + +void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(std::vector&); +void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instances(std::vector&); +void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instances(std::vector&); +void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instances(std::vector&); + +} // namespace device_conv3d_fwd_instance + +} // namespace device +} // namespace tensor_operation +} // namespace ck + namespace ck { namespace utils { namespace conv { @@ -47,20 +84,7 @@ std::size_t get_flops(ck::index_t N, ck::index_t C, ck::index_t K, const std::vector& filter_spatial_lengths, - const std::vector& output_spatial_lengths) -{ - // 2 * N * K * * C * - return static_cast(2) * N * K * - 
std::accumulate(std::begin(output_spatial_lengths), - std::end(output_spatial_lengths), - static_cast(1), - std::multiplies()) * - C * - std::accumulate(std::begin(filter_spatial_lengths), - std::end(filter_spatial_lengths), - static_cast(1), - std::multiplies()); -} + const std::vector& output_spatial_lengths); /** * @brief Calculate number of bytes read/write by convolution algorithm. @@ -110,20 +134,7 @@ std::size_t get_btype(ck::index_t N, struct ConvParams { - ConvParams() - : num_dim_spatial(2), - N(128), - K(256), - C(192), - filter_spatial_lengths(2, 3), - input_spatial_lengths(2, 71), - conv_filter_strides(2, 2), - conv_filter_dilations(2, 1), - input_left_pads(2, 1), - input_right_pads(2, 1) - { - } - + ConvParams(); ConvParams(ck::index_t n_dim, ck::index_t n_batch, ck::index_t n_out_channels, @@ -133,29 +144,7 @@ struct ConvParams const std::vector& strides, const std::vector& dilations, const std::vector& left_pads, - const std::vector& right_pads) - : num_dim_spatial(n_dim), - N(n_batch), - K(n_out_channels), - C(n_in_channels), - filter_spatial_lengths(filters_len), - input_spatial_lengths(input_len), - conv_filter_strides(strides), - conv_filter_dilations(dilations), - input_left_pads(left_pads), - input_right_pads(right_pads) - { - if(filter_spatial_lengths.size() != num_dim_spatial || - input_spatial_lengths.size() != num_dim_spatial || - conv_filter_strides.size() != num_dim_spatial || - conv_filter_dilations.size() != num_dim_spatial || - input_left_pads.size() != num_dim_spatial || input_right_pads.size() != num_dim_spatial) - { - throw(std::runtime_error( - "ConvParams::GetOutputSpatialLengths: " - "parameter size is different from number of declared dimensions!")); - } - } + const std::vector& right_pads); ck::index_t num_dim_spatial; ck::index_t N; @@ -171,35 +160,11 @@ struct ConvParams std::vector input_left_pads; std::vector input_right_pads; - std::vector GetOutputSpatialLengths() const - { - if(filter_spatial_lengths.size() != 
num_dim_spatial || - input_spatial_lengths.size() != num_dim_spatial || - conv_filter_strides.size() != num_dim_spatial || - conv_filter_dilations.size() != num_dim_spatial || - input_left_pads.size() != num_dim_spatial || input_right_pads.size() != num_dim_spatial) - { - throw(std::runtime_error( - "ConvParams::GetOutputSpatialLengths: " - "parameter size is different from number of declared dimensions!")); - } - - std::vector out_spatial_len(num_dim_spatial, 0); - for(ck::index_t i = 0; i < num_dim_spatial; ++i) - { - // XEff = (X - 1) * conv_dilation_w + 1; - // Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; - const ck::index_t idx_eff = - (filter_spatial_lengths[i] - 1) * conv_filter_dilations[i] + 1; - out_spatial_len[i] = - (input_spatial_lengths[i] + input_left_pads[i] + input_right_pads[i] - idx_eff) / - conv_filter_strides[i] + - 1; - } - return out_spatial_len; - } + std::vector GetOutputSpatialLengths() const; }; +ConvParams parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[]); + /** * @brief Gets the host tensor descriptor. 
* @@ -221,13 +186,13 @@ HostTensorDescriptor get_host_tensor_descriptor(const std::vector& std::is_same::value) { - return HostTensorDescriptor(dims, std::vector({C * dims[2], dims[2], 1})); + return HostTensorDescriptor(dims, std::vector{C * dims[2], dims[2], 1}); } else if constexpr(std::is_same::value || std::is_same::value || std::is_same::value) { - return HostTensorDescriptor(dims, std::vector({C * dims[2], 1, C})); + return HostTensorDescriptor(dims, std::vector{C * dims[2], 1, C}); } // 2D else if constexpr(std::is_same::value || @@ -273,132 +238,14 @@ HostTensorDescriptor get_host_tensor_descriptor(const std::vector& throw std::runtime_error(err_msg.str()); } -template -auto get_host_tensors(const ConvParams& params, bool init = true) -{ - std::vector input_dims{static_cast(params.N), - static_cast(params.C)}; - input_dims.insert(std::end(input_dims), - std::begin(params.input_spatial_lengths), - std::end(params.input_spatial_lengths)); - - std::vector filter_dims{static_cast(params.K), - static_cast(params.C)}; - filter_dims.insert(std::end(filter_dims), - std::begin(params.filter_spatial_lengths), - std::end(params.filter_spatial_lengths)); - - const std::vector& output_spatial_lengths = params.GetOutputSpatialLengths(); - std::vector output_dims{static_cast(params.N), - static_cast(params.K)}; - output_dims.insert(std::end(output_dims), - std::begin(output_spatial_lengths), - std::end(output_spatial_lengths)); - - Tensor input(ck::utils::conv::get_host_tensor_descriptor(input_dims, InLayout{})); - Tensor weights( - ck::utils::conv::get_host_tensor_descriptor(filter_dims, WeiLayout{})); - Tensor host_output( - ck::utils::conv::get_host_tensor_descriptor(output_dims, OutLayout{})); - Tensor device_output( - ck::utils::conv::get_host_tensor_descriptor(output_dims, OutLayout{})); - - if(init) - { - std::mt19937 gen(11939); - if constexpr(std::is_same::value) - { - std::uniform_int_distribution<> dis(-5, 5); - std::generate( - input.begin(), input.end(), 
[&dis, &gen]() { return InDataType(dis(gen)); }); - std::generate( - weights.begin(), weights.end(), [&dis, &gen]() { return WeiDataType(dis(gen)); }); - } - else - { - std::uniform_real_distribution<> dis(0.f, 1.f); - std::generate( - input.begin(), input.end(), [&dis, &gen]() { return InDataType(dis(gen)); }); - std::generate( - weights.begin(), weights.end(), [&dis, &gen]() { return WeiDataType(dis(gen)); }); - } - std::fill(host_output.begin(), host_output.end(), OutDataType(0.f)); - std::fill(device_output.begin(), device_output.end(), OutDataType(0.f)); - } - - return std::make_tuple(input, weights, host_output, device_output); -} - HostTensorDescriptor get_output_host_tensor_descriptor(const std::vector& dims, - int num_dim_spatial = 2) -{ - namespace tl = ck::tensor_layout::convolution; - - switch(num_dim_spatial) - { - case 3: { - return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NDHWK{}); - } - case 2: { - return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWK{}); - } - case 1: { - return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NWK{}); - } - default: { - throw std::runtime_error("Unsupported number of spatial dimensions provided!"); - } - } -} + int num_dim_spatial = 2); HostTensorDescriptor get_filters_host_tensor_descriptor(const std::vector& dims, - int num_dim_spatial = 2) -{ - namespace tl = ck::tensor_layout::convolution; - - switch(num_dim_spatial) - { - case 3: { - return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KZYXC{}); - } - case 2: { - return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KYXC{}); - } - case 1: { - return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KXC{}); - } - default: { - throw std::runtime_error("Unsupported number of spatial dimensions provided!"); - } - } -} + int num_dim_spatial = 2); HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector& dims, - int num_dim_spatial = 2) -{ - namespace tl = ck::tensor_layout::convolution; - - 
switch(num_dim_spatial) - { - case 3: { - return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NDHWC{}); - } - case 2: { - return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWC{}); - } - case 1: { - return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NWC{}); - } - default: { - throw std::runtime_error("Unsupported number of spatial dimensions provided!"); - } - } -} + int num_dim_spatial = 2); template - class DeviceConvNDFwdInstance> -void run_convolution_forward(const ConvParams& params, - const Tensor& input, - const Tensor& weights, - Tensor& output) +template +struct ConvolutionFwdInstances; + +template <> +struct ConvolutionFwdInstances { - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - - DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace()); - DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace()); - DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace()); - - in_device_buf.ToDevice(input.mData.data()); - wei_device_buf.ToDevice(weights.mData.data()); - const std::vector& output_spatial_lengths = params.GetOutputSpatialLengths(); - - auto conv = DeviceConvNDFwdInstance(); - auto invoker = conv.MakeInvoker(); - auto argument = conv.MakeArgument(static_cast(in_device_buf.GetDeviceBuffer()), - static_cast(wei_device_buf.GetDeviceBuffer()), - static_cast(out_device_buf.GetDeviceBuffer()), - params.N, - params.K, - params.C, - params.input_spatial_lengths, - params.filter_spatial_lengths, - output_spatial_lengths, - params.conv_filter_strides, - params.conv_filter_dilations, - params.input_left_pads, - params.input_right_pads, - PassThrough{}, - PassThrough{}, - PassThrough{}); - - if(!conv.IsSupportedArgument(argument)) + template = 1 && NumDimSpatial <= 3, bool>::type = false> + static std::vector Get() { - throw std::runtime_error( - "Error! 
device_conv with the specified compilation parameters does " - "not support this Conv problem"); - } - - invoker.Run(argument); - out_device_buf.FromDevice(output.mData.data()); -} - -template -bool run_convolution_forward_instances(const ConvParams& params, - const std::vector& conv_ptrs, - const Tensor& input, - const Tensor& weights, - Tensor& output, - const Tensor& host_output) -{ - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - - DeviceMem in_device_buf(sizeof(InDataType) * input.mDesc.GetElementSpace()); - DeviceMem wei_device_buf(sizeof(WeiDataType) * weights.mDesc.GetElementSpace()); - DeviceMem out_device_buf(sizeof(OutDataType) * output.mDesc.GetElementSpace()); - - in_device_buf.ToDevice(input.mData.data()); - wei_device_buf.ToDevice(weights.mData.data()); - const std::vector& output_spatial_lengths = params.GetOutputSpatialLengths(); - - bool res{true}; - for(auto& conv_ptr : conv_ptrs) - { - auto invoker = conv_ptr->MakeInvokerPointer(); - auto argument = conv_ptr->MakeArgumentPointer( - static_cast(in_device_buf.GetDeviceBuffer()), - static_cast(wei_device_buf.GetDeviceBuffer()), - static_cast(out_device_buf.GetDeviceBuffer()), - params.N, - params.K, - params.C, - params.input_spatial_lengths, - params.filter_spatial_lengths, - output_spatial_lengths, - params.conv_filter_strides, - params.conv_filter_dilations, - params.input_left_pads, - params.input_right_pads, - PassThrough{}, - PassThrough{}, - PassThrough{}); - - if(conv_ptr->IsSupportedArgument(argument.get())) + std::vector conv_ptrs; + if constexpr(NumDimSpatial == 1) { - float atol{1e-5f}; - float rtol{1e-4f}; - if constexpr(std::is_same_v) - { - atol = 1e-4f; - rtol = 2.5e-3f; - } - invoker->Run(argument.get()); - out_device_buf.FromDevice(output.mData.data()); - res = res && - ck::utils::check_err( - output.mData, host_output.mData, "Error: incorrect results!", atol, rtol); - hipGetErrorString( - hipMemset(out_device_buf.GetDeviceBuffer(), 0, 
out_device_buf.mMemSize)); + ck::tensor_operation::device::device_conv1d_fwd_instance:: + add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instances(conv_ptrs); } + else if constexpr(NumDimSpatial == 2) + { + ck::tensor_operation::device::device_conv2d_fwd_instance:: + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs); + } + else if constexpr(NumDimSpatial == 3) + { + ck::tensor_operation::device::device_conv3d_fwd_instance:: + add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instances(conv_ptrs); + } + return conv_ptrs; } - return res; -} +}; + +template <> +struct ConvolutionFwdInstances +{ + template = 1 && NumDimSpatial <= 3, bool>::type = false> + static std::vector Get() + { + std::vector conv_ptrs; + if constexpr(NumDimSpatial == 1) + { + ck::tensor_operation::device::device_conv1d_fwd_instance:: + add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instances(conv_ptrs); + return conv_ptrs; + } + else if constexpr(NumDimSpatial == 2) + { + ck::tensor_operation::device::device_conv2d_fwd_instance:: + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs); + ck::tensor_operation::device::device_conv2d_fwd_instance:: + add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(conv_ptrs); + } + else if constexpr(NumDimSpatial == 3) + { + ck::tensor_operation::device::device_conv3d_fwd_instance:: + add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instances(conv_ptrs); + } + return conv_ptrs; + } +}; + +template <> +struct ConvolutionFwdInstances +{ + template = 1 && NumDimSpatial <= 3, bool>::type = false> + static std::vector Get() + { + std::vector conv_ptrs; + if constexpr(NumDimSpatial == 1) + { + ck::tensor_operation::device::device_conv1d_fwd_instance:: + add_device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instances(conv_ptrs); + } + else if constexpr(NumDimSpatial == 2) + { + ck::tensor_operation::device::device_conv2d_fwd_instance:: + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs); + } + else if constexpr(NumDimSpatial == 3) + { 
+ ck::tensor_operation::device::device_conv3d_fwd_instance:: + add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(conv_ptrs); + } + return conv_ptrs; + } +}; + +template <> +struct ConvolutionFwdInstances +{ + template = 1 && NumDimSpatial <= 3, bool>::type = false> + static std::vector Get() + { + std::vector conv_ptrs; + if constexpr(NumDimSpatial == 1) + { + ck::tensor_operation::device::device_conv1d_fwd_instance:: + add_device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances(conv_ptrs); + } + else if constexpr(NumDimSpatial == 2) + { + ck::tensor_operation::device::device_conv2d_fwd_instance:: + add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(conv_ptrs); + } + else if constexpr(NumDimSpatial == 3) + { + ck::tensor_operation::device::device_conv3d_fwd_instance:: + add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instances(conv_ptrs); + } + return conv_ptrs; + } +}; + +template , + typename WeightsInitFun = FillUniform> +class ConvFwdOpInstance : public ck::utils::OpInstance +{ + using DeviceConvFwdOp = tensor_operation::device:: + DeviceConvFwd; + using DeviceMemPtr = std::unique_ptr; + using DeviceBuffers = std::vector; + using BaseType = ck::utils::OpInstance; + template + using TensorPtr = std::unique_ptr>; + using InTensorsTuple = std::tuple, TensorPtr>; + + public: + ConvFwdOpInstance() = delete; + ConvFwdOpInstance(const ConvFwdOpInstance&) = default; + ConvFwdOpInstance& operator=(const ConvFwdOpInstance&) = default; + + ConvFwdOpInstance(const ConvParams& params, + bool do_init = true, + const InputInitFun& input_init_f = InputInitFun{}, + const WeightsInitFun& weights_init_f = WeightsInitFun{}) + : BaseType(), + params_{params}, + output_spatial_lengths_{params.GetOutputSpatialLengths()}, + do_init_{do_init}, + input_init_f_{input_init_f}, + weights_init_f_{weights_init_f} + { + } + + virtual ~ConvFwdOpInstance() override{}; + + virtual InTensorsTuple GetInputTensors() const override + { + std::vector input_dims{static_cast(params_.N), + 
static_cast(params_.C)}; + input_dims.insert(std::end(input_dims), + std::begin(params_.input_spatial_lengths), + std::end(params_.input_spatial_lengths)); + + std::vector filter_dims{static_cast(params_.K), + static_cast(params_.C)}; + filter_dims.insert(std::end(filter_dims), + std::begin(params_.filter_spatial_lengths), + std::end(params_.filter_spatial_lengths)); + + auto input = std::make_unique>( + get_host_tensor_descriptor(input_dims, InLayout{})); + auto weights = std::make_unique>( + get_host_tensor_descriptor(filter_dims, WeiLayout{})); + + if(do_init_) + { + input_init_f_(input->begin(), input->end()); + weights_init_f_(weights->begin(), weights->end()); + } + + return std::make_tuple(std::move(input), std::move(weights)); + } + + virtual TensorPtr GetOutputTensor() const override + { + std::vector output_dims{static_cast(params_.N), + static_cast(params_.K)}; + output_dims.insert(std::end(output_dims), + std::begin(output_spatial_lengths_), + std::end(output_spatial_lengths_)); + auto output = std::make_unique>( + get_host_tensor_descriptor(output_dims, OutLayout{})); + + if(do_init_) + { + std::fill(output->begin(), output->end(), OutDataType(0.f)); + } + return output; + } + + virtual std::unique_ptr + MakeInvokerPointer(tensor_operation::device::BaseOperator* op_ptr) const override + { + static_assert( + std::is_same_v); + static_assert( + std::is_same_v); + static_assert( + std::is_same_v); + + auto conv_ptr = dynamic_cast(op_ptr); + if(!conv_ptr) + { + throw std::runtime_error( + "[ConvFwdOpInstance]: couldn't cast op_ptr to DeviceConvFwdNoOpPtr type!"); + } + return conv_ptr->MakeInvokerPointer(); + } + + virtual std::unique_ptr + MakeArgumentPointer(tensor_operation::device::BaseOperator* op_ptr, + const DeviceBuffers& in_device_buffers, + const DeviceMemPtr& out_device_buffer) const override + { + static_assert( + std::is_same_v); + static_assert( + std::is_same_v); + static_assert( + std::is_same_v); + + auto conv_ptr = dynamic_cast(op_ptr); + 
if(!conv_ptr) + { + throw std::runtime_error( + "[ConvFwdOpInstance]: couldn't cast op_ptr to DeviceConvFwdNoOpPtr type!"); + } + + return conv_ptr->MakeArgumentPointer( + static_cast(in_device_buffers[0]->GetDeviceBuffer()), + static_cast(in_device_buffers[1]->GetDeviceBuffer()), + static_cast(out_device_buffer->GetDeviceBuffer()), + params_.N, + params_.K, + params_.C, + params_.input_spatial_lengths, + params_.filter_spatial_lengths, + output_spatial_lengths_, + params_.conv_filter_strides, + params_.conv_filter_dilations, + params_.input_left_pads, + params_.input_right_pads, + InElementwiseOp{}, + WeiElementwiseOp{}, + OutElementwiseOp{}); + } + + virtual std::size_t GetFlops() const override + { + return get_flops(params_.N, + params_.C, + params_.K, + params_.filter_spatial_lengths, + output_spatial_lengths_); + } + + virtual std::size_t GetBtype() const override + { + return get_btype(params_.N, + params_.C, + params_.K, + params_.input_spatial_lengths, + params_.filter_spatial_lengths, + output_spatial_lengths_); + } + + private: + const ConvParams& params_; + const std::vector output_spatial_lengths_; + const bool do_init_; + const InputInitFun& input_init_f_; + const WeightsInitFun& weights_init_f_; +}; } // namespace conv } // namespace utils } // namespace ck -#endif +std::ostream& operator<<(std::ostream& os, const ck::utils::conv::ConvParams& p); diff --git a/library/include/ck/library/utility/fill.hpp b/library/include/ck/library/utility/fill.hpp new file mode 100644 index 0000000000..f44aec969d --- /dev/null +++ b/library/include/ck/library/utility/fill.hpp @@ -0,0 +1,81 @@ +#pragma once + +#include +#include + +#include "data_type.hpp" + +namespace ck { +namespace utils { + +// template +// struct FillUniform; + +// TODO: what's wrong with this specialization??? +// err: segmentation fault in mt19937 - infinite loop like. 
+// template +// struct FillUniform::value && +// !std::is_same::value>::type> +// { +// int a_{0}; +// int b_{5}; +// // T a_ = T{0}; +// // T b_ = T{5}; + +// template +// void operator()(ForwardIter first, ForwardIter last) const +// { +// std::mt19937 gen{11939}; +// std::uniform_int_distribution dis(a_, b_); +// std::generate(first, last, [&dis, &gen]() { return ck::type_convert(dis(gen)); }); +// } +// }; + +// struct FillUniform::value || +// std::is_same::value>::type> +template +struct FillUniform +{ + float a_{0}; + float b_{5}; + + template + void operator()(ForwardIter first, ForwardIter last) const + { + std::mt19937 gen{11939}; + std::uniform_real_distribution<> dis(a_, b_); + std::generate(first, last, [&dis, &gen]() { return ck::type_convert(dis(gen)); }); + } +}; + +template +struct FillMonotonicSeq +{ + T init_value_{0}; + T step_{1}; + + template + void operator()(ForwardIter first, ForwardIter last) const + { + std::generate(first, last, [=, n = init_value_]() mutable { + auto tmp = n; + n += step_; + return tmp; + }); + } +}; + +template +struct FillConstant +{ + T value_{0}; + + template + void operator()(ForwardIter first, ForwardIter last) const + { + std::fill(first, last, value_); + } +}; + +} // namespace utils +} // namespace ck diff --git a/library/include/ck/library/utility/op_instance_engine.hpp b/library/include/ck/library/utility/op_instance_engine.hpp new file mode 100644 index 0000000000..ec88b4e1b9 --- /dev/null +++ b/library/include/ck/library/utility/op_instance_engine.hpp @@ -0,0 +1,231 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "check_err.hpp" +#include "device_base.hpp" +#include "functional2.hpp" + +namespace ck { +namespace utils { + +struct ProfileBestConfig +{ + std::string best_op_name; + float best_avg_time = std::numeric_limits::max(); + float best_tflops = std::numeric_limits::max(); + float best_gb_per_sec = std::numeric_limits::max(); +}; + +/** + * @brief 
This class describes an operation instance(s). + * + * Op instance defines a particular specializations of operator + * template. Thanks to this specific input/output data types, data + * layouts and modifying elementwise operations it is able to create + * it's input/output tensors, provide pointers to instances which + * can execute it and all operation specific parameters. + */ +template +class OpInstance +{ + public: + template + using TensorPtr = std::unique_ptr>; + using InTensorsTuple = std::tuple...>; + using DeviceMemPtr = std::unique_ptr; + using DeviceBuffers = std::vector; + + OpInstance() = default; + OpInstance(const OpInstance&) = default; + OpInstance& operator=(const OpInstance&) = default; + virtual ~OpInstance(){}; + + virtual InTensorsTuple GetInputTensors() const = 0; + virtual TensorPtr GetOutputTensor() const = 0; + virtual std::unique_ptr + MakeInvokerPointer(tensor_operation::device::BaseOperator*) const = 0; + virtual std::unique_ptr + MakeArgumentPointer(tensor_operation::device::BaseOperator*, + const DeviceBuffers&, + const DeviceMemPtr&) const = 0; + virtual std::size_t GetFlops() const = 0; + virtual std::size_t GetBtype() const = 0; +}; + +/** + * @brief A generic operation instance run engine. 
+ */ +template +class OpInstanceRunEngine +{ + public: + using OpInstanceT = OpInstance; + template + using TensorPtr = std::unique_ptr>; + using DeviceMemPtr = std::unique_ptr; + using InTensorsTuple = std::tuple...>; + using DeviceBuffers = std::vector; + using InArgsTypesTuple = std::tuple; + + OpInstanceRunEngine() = delete; + + template > + OpInstanceRunEngine(const OpInstanceT& op_instance, + const ReferenceOp& reference_op = ReferenceOp{}) + : op_instance_{op_instance} + { + in_tensors_ = op_instance_.GetInputTensors(); + out_tensor_ = op_instance_.GetOutputTensor(); + + if constexpr(std::is_invocable_v&..., + Tensor&>) + { + ref_output_ = op_instance_.GetOutputTensor(); + CallRefOpUnpackArgs(reference_op, std::make_index_sequence{}); + } + AllocateDeviceInputTensors(std::make_index_sequence{}); + out_device_buffer_ = + std::make_unique(sizeof(OutDataType) * out_tensor_->mDesc.GetElementSpace()); + out_device_buffer_->SetZero(); + } + + virtual ~OpInstanceRunEngine(){}; + + template + bool Test(const std::vector& op_ptrs) + { + bool res{true}; + for(auto& op_ptr : op_ptrs) + { + auto invoker = op_instance_.MakeInvokerPointer(op_ptr.get()); + auto argument = op_instance_.MakeArgumentPointer( + op_ptr.get(), in_device_buffers_, out_device_buffer_); + if(op_ptr->IsSupportedArgument(argument.get())) + { + invoker->Run(argument.get()); + out_device_buffer_->FromDevice(out_tensor_->mData.data()); + if(!ref_output_) + { + throw std::runtime_error( + "OpInstanceRunEngine::Test: Reference value not availabe." 
+ " You have to provide reference function."); + } + // TODO: enable flexible use of custom check_error functions + res = res && check_err(out_tensor_->mData, ref_output_->mData); + out_device_buffer_->SetZero(); + } + } + return res; + } + + template + ProfileBestConfig Profile(const std::vector& op_ptrs, + int nrepeat = 100, + bool do_verification = false, + bool do_log = false) + { + bool res{true}; + ProfileBestConfig best_config; + + for(auto& op_ptr : op_ptrs) + { + auto invoker = op_instance_.MakeInvokerPointer(op_ptr.get()); + auto argument = op_instance_.MakeArgumentPointer( + op_ptr.get(), in_device_buffers_, out_device_buffer_); + if(op_ptr->IsSupportedArgument(argument.get())) + { + std::string op_name = op_ptr->GetTypeString(); + float avg_time = invoker->Run(argument.get(), nrepeat); + + std::size_t flops = op_instance_.GetFlops(); + std::size_t num_btype = op_instance_.GetBtype(); + float tflops = static_cast(flops) / 1.E9 / avg_time; + float gb_per_sec = num_btype / 1.E6 / avg_time; + + std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << op_name << std::endl; + + if(tflops < best_config.best_tflops) + { + best_config.best_op_name = op_name; + best_config.best_tflops = tflops; + best_config.best_gb_per_sec = gb_per_sec; + best_config.best_avg_time = avg_time; + } + + if(do_verification) + { + out_device_buffer_->FromDevice(out_tensor_->mData.data()); + if(!ref_output_) + { + throw std::runtime_error( + "OpInstanceRunEngine::Profile: Reference value not availabe." 
+ " You have to provide reference function."); + } + // TODO: enable flexible use of custom check_error functions + res = res && CheckErr(out_tensor_->mData, ref_output_->mData); + + if(do_log) {} + } + out_device_buffer_->SetZero(); + } + } + return best_config; + } + + void SetAtol(double a) { atol_ = a; } + void SetRtol(double r) { rtol_ = r; } + + private: + template + void CallRefOpUnpackArgs(const F& f, std::index_sequence) const + { + f(*std::get(in_tensors_)..., *ref_output_); + } + + template + void AllocateDeviceInputTensors(std::index_sequence) + { + (AllocateDeviceInputTensorsImpl(), ...); + } + + template + void AllocateDeviceInputTensorsImpl() + { + const auto& ts = std::get(in_tensors_); + in_device_buffers_ + .emplace_back( + std::make_unique(sizeof(std::tuple_element_t) * + ts->mDesc.GetElementSpace())) + ->ToDevice(ts->mData.data()); + } + + static constexpr std::size_t kNInArgs_ = std::tuple_size_v; + const OpInstanceT& op_instance_; + double rtol_{1e-5}; + double atol_{1e-8}; + + InTensorsTuple in_tensors_; + TensorPtr out_tensor_; + TensorPtr ref_output_; + + DeviceBuffers in_device_buffers_; + DeviceMemPtr out_device_buffer_; + + template + bool CheckErr(const std::vector& dev_out, const std::vector& ref_out) const + { + return ck::utils::check_err(dev_out, ref_out, "Error: incorrect results!", atol_, rtol_); + } +}; + +} // namespace utils +} // namespace ck diff --git a/library/src/utility/CMakeLists.txt b/library/src/utility/CMakeLists.txt new file mode 100644 index 0000000000..3580ba1a8f --- /dev/null +++ b/library/src/utility/CMakeLists.txt @@ -0,0 +1,21 @@ +include_directories(BEFORE + ${PROJECT_SOURCE_DIR}/include/ck + ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/device + ${PROJECT_SOURCE_DIR}/include/ck/tensor_operation/gpu/element + ${PROJECT_SOURCE_DIR}/include/ck/utility + ${PROJECT_SOURCE_DIR}/library/include/ck/library/host_tensor + ${PROJECT_SOURCE_DIR}/library/include/ck/library/reference_tensor_operation/cpu + 
${PROJECT_SOURCE_DIR}/library/include/ck/library/utility +) + +set(CONV_FWD_UTIL_SOURCE + conv_fwd_util.cpp +) + +add_library(conv_fwd_util SHARED ${CONV_FWD_UTIL_SOURCE}) +target_link_libraries(conv_fwd_util PRIVATE host_tensor) +target_compile_features(conv_fwd_util PUBLIC) +set_target_properties(conv_fwd_util PROPERTIES POSITION_INDEPENDENT_CODE ON) +target_include_directories(conv_fwd_util SYSTEM PUBLIC $) + +clang_tidy_check(conv_fwd_util) diff --git a/library/src/utility/conv_fwd_util.cpp b/library/src/utility/conv_fwd_util.cpp new file mode 100644 index 0000000000..fde2caa56b --- /dev/null +++ b/library/src/utility/conv_fwd_util.cpp @@ -0,0 +1,238 @@ + +#include "conv_fwd_util.hpp" + +namespace ck { +namespace utils { +namespace conv { + +/** + * @brief Calculate number of FLOPs for Convolution + * + * @param[in] N Batch size. + * @param[in] C Number of input channels. + * @param[in] K Number of output channels. + * @param[in] filter_spatial_lengths Filter spatial dimensions lengths. + * @param[in] output_spatial_lengths Convolution output spatial dimensions + * lengths. + * + * @return The number of flops. 
+ */ +std::size_t get_flops(ck::index_t N, + ck::index_t C, + ck::index_t K, + const std::vector& filter_spatial_lengths, + const std::vector& output_spatial_lengths) +{ + // 2 * N * K * * C * + return static_cast(2) * N * K * + std::accumulate(std::begin(output_spatial_lengths), + std::end(output_spatial_lengths), + static_cast(1), + std::multiplies()) * + C * + std::accumulate(std::begin(filter_spatial_lengths), + std::end(filter_spatial_lengths), + static_cast(1), + std::multiplies()); +} + +ConvParams::ConvParams() + : num_dim_spatial(2), + N(128), + K(256), + C(192), + filter_spatial_lengths(2, 3), + input_spatial_lengths(2, 71), + conv_filter_strides(2, 2), + conv_filter_dilations(2, 1), + input_left_pads(2, 1), + input_right_pads(2, 1) +{ +} + +ConvParams::ConvParams(ck::index_t n_dim, + ck::index_t n_batch, + ck::index_t n_out_channels, + ck::index_t n_in_channels, + const std::vector& filters_len, + const std::vector& input_len, + const std::vector& strides, + const std::vector& dilations, + const std::vector& left_pads, + const std::vector& right_pads) + : num_dim_spatial(n_dim), + N(n_batch), + K(n_out_channels), + C(n_in_channels), + filter_spatial_lengths(filters_len), + input_spatial_lengths(input_len), + conv_filter_strides(strides), + conv_filter_dilations(dilations), + input_left_pads(left_pads), + input_right_pads(right_pads) +{ + if(filter_spatial_lengths.size() != num_dim_spatial || + input_spatial_lengths.size() != num_dim_spatial || + conv_filter_strides.size() != num_dim_spatial || + conv_filter_dilations.size() != num_dim_spatial || + input_left_pads.size() != num_dim_spatial || input_right_pads.size() != num_dim_spatial) + { + throw(std::runtime_error( + "ConvParams::GetOutputSpatialLengths: " + "parameter size is different from number of declared dimensions!")); + } +} + +std::vector ConvParams::GetOutputSpatialLengths() const +{ + if(filter_spatial_lengths.size() != num_dim_spatial || + input_spatial_lengths.size() != num_dim_spatial || + 
conv_filter_strides.size() != num_dim_spatial || + conv_filter_dilations.size() != num_dim_spatial || + input_left_pads.size() != num_dim_spatial || input_right_pads.size() != num_dim_spatial) + { + throw(std::runtime_error( + "ConvParams::GetOutputSpatialLengths: " + "parameter size is different from number of declared dimensions!")); + } + + std::vector out_spatial_len(num_dim_spatial, 0); + for(ck::index_t i = 0; i < num_dim_spatial; ++i) + { + // XEff = (X - 1) * conv_dilation_w + 1; + // Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; + const ck::index_t idx_eff = + (filter_spatial_lengths[i] - 1) * conv_filter_dilations[i] + 1; + out_spatial_len[i] = + (input_spatial_lengths[i] + input_left_pads[i] + input_right_pads[i] - idx_eff) / + conv_filter_strides[i] + + 1; + } + return out_spatial_len; +} + +ConvParams parse_conv_params(int num_dim_spatial, int arg_idx, char* const argv[]) +{ + ck::utils::conv::ConvParams params; + + params.num_dim_spatial = num_dim_spatial; + params.N = std::stoi(argv[arg_idx++]); + params.K = std::stoi(argv[arg_idx++]); + params.C = std::stoi(argv[arg_idx++]); + + params.filter_spatial_lengths.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.filter_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + } + params.input_spatial_lengths.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.input_spatial_lengths[i] = std::stoi(argv[arg_idx++]); + } + params.conv_filter_strides.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.conv_filter_strides[i] = std::stoi(argv[arg_idx++]); + } + params.conv_filter_dilations.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.conv_filter_dilations[i] = std::stoi(argv[arg_idx++]); + } + params.input_left_pads.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.input_left_pads[i] = std::stoi(argv[arg_idx++]); + } + 
params.input_right_pads.resize(num_dim_spatial); + for(int i = 0; i < num_dim_spatial; ++i) + { + params.input_right_pads[i] = std::stoi(argv[arg_idx++]); + } + + return params; +} + +HostTensorDescriptor get_output_host_tensor_descriptor(const std::vector& dims, + int num_dim_spatial) +{ + namespace tl = ck::tensor_layout::convolution; + + switch(num_dim_spatial) + { + case 3: { + return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NDHWK{}); + } + case 2: { + return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWK{}); + } + case 1: { + return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NWK{}); + } + default: { + throw std::runtime_error("Unsupported number of spatial dimensions provided!"); + } + } +} + +HostTensorDescriptor get_filters_host_tensor_descriptor(const std::vector& dims, + int num_dim_spatial) +{ + namespace tl = ck::tensor_layout::convolution; + + switch(num_dim_spatial) + { + case 3: { + return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KZYXC{}); + } + case 2: { + return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KYXC{}); + } + case 1: { + return ck::utils::conv::get_host_tensor_descriptor(dims, tl::KXC{}); + } + default: { + throw std::runtime_error("Unsupported number of spatial dimensions provided!"); + } + } +} + +HostTensorDescriptor get_input_host_tensor_descriptor(const std::vector& dims, + int num_dim_spatial) +{ + namespace tl = ck::tensor_layout::convolution; + + switch(num_dim_spatial) + { + case 3: { + return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NDHWC{}); + } + case 2: { + return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NHWC{}); + } + case 1: { + return ck::utils::conv::get_host_tensor_descriptor(dims, tl::NWC{}); + } + default: { + throw std::runtime_error("Unsupported number of spatial dimensions provided!"); + } + } +} + +} // namespace conv +} // namespace utils +} // namespace ck + +std::ostream& operator<<(std::ostream& os, const 
ck::utils::conv::ConvParams& p) +{ + os << "ConvParams {" + << "\nnum_dim_spatial: " << p.num_dim_spatial << "\nN: " << p.N << "\nK: " << p.K + << "\nC: " << p.C << "\nfilter_spatial_lengths: " << p.filter_spatial_lengths + << "\ninput_spatial_lengths: " << p.input_spatial_lengths + << "\nconv_filter_strides: " << p.conv_filter_strides + << "\nconv_filter_dilations: " << p.conv_filter_dilations + << "\ninput_left_pads: " << p.input_left_pads + << "\ninput_right_pads: " << p.input_right_pads; + return os; +} diff --git a/profiler/CMakeLists.txt b/profiler/CMakeLists.txt index a2cf6eeb62..dd8ebe306d 100644 --- a/profiler/CMakeLists.txt +++ b/profiler/CMakeLists.txt @@ -29,10 +29,10 @@ set(PROFILER_SOURCE src/profile_gemm_bias_relu_add.cpp src/profile_gemm_reduce.cpp src/profile_batched_gemm.cpp - src/profile_conv_fwd.cpp src/profile_conv_fwd_bias_relu.cpp src/profile_conv_fwd_bias_relu_add.cpp src/profile_conv_fwd_bias_relu_atomic_add.cpp + src/profile_convnd_fwd.cpp src/profile_convnd_bwd_data.cpp src/profile_reduce.cpp src/profile_grouped_gemm.cpp @@ -43,19 +43,21 @@ set(PROFILER_SOURCE add_executable(ckProfiler ${PROFILER_SOURCE}) target_link_libraries(ckProfiler PRIVATE host_tensor) +target_link_libraries(ckProfiler PRIVATE conv_fwd_util) target_link_libraries(ckProfiler PRIVATE device_gemm_reduce_instance) target_link_libraries(ckProfiler PRIVATE device_gemm_instance) target_link_libraries(ckProfiler PRIVATE device_gemm_bias2d_instance) target_link_libraries(ckProfiler PRIVATE device_gemm_bias_relu_instance) target_link_libraries(ckProfiler PRIVATE device_gemm_bias_relu_add_instance) target_link_libraries(ckProfiler PRIVATE device_batched_gemm_instance) +target_link_libraries(ckProfiler PRIVATE device_conv1d_fwd_instance) target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_instance) +target_link_libraries(ckProfiler PRIVATE device_conv3d_fwd_instance) target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance) 
target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance) target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_atomic_add_instance) target_link_libraries(ckProfiler PRIVATE device_convnd_bwd_data_instance) target_link_libraries(ckProfiler PRIVATE device_reduce_instance) -target_link_libraries(ckProfiler PRIVATE device_reduce_instance) target_link_libraries(ckProfiler PRIVATE device_grouped_gemm_instance) target_link_libraries(ckProfiler PRIVATE device_conv2d_bwd_weight_instance) target_link_libraries(ckProfiler PRIVATE device_batched_gemm_reduce_instance) diff --git a/profiler/include/profile_conv_fwd_impl.hpp b/profiler/include/profile_conv_fwd_impl.hpp deleted file mode 100644 index 6038cd4612..0000000000 --- a/profiler/include/profile_conv_fwd_impl.hpp +++ /dev/null @@ -1,283 +0,0 @@ -#pragma once - -#include "check_err.hpp" -#include "config.hpp" -#include "device.hpp" -#include "host_tensor.hpp" -#include "host_tensor_generator.hpp" -#include "tensor_layout.hpp" -#include "device_tensor.hpp" -#include "device_conv_fwd.hpp" -#include "element_wise_operation.hpp" -#include "reference_conv_fwd.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_conv2d_fwd_instance { - -using DeviceConvFwdNoOpPtr = DeviceConvFwdPtr; - -void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(std::vector&); - -void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(std::vector&); - -void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances( - std::vector&); - -void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(std::vector&); - -void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(std::vector&); -} // namespace device_conv2d_fwd_instance -} // namespace device -} // namespace tensor_operation -} // namespace ck - -namespace ck { -namespace profiler { - -template -void profile_conv_fwd_impl(int do_verification, - int init_method, - bool do_log, - int nrepeat, - 
ck::index_t N, - ck::index_t K, - ck::index_t C, - std::vector input_spatial_lengths, - std::vector filter_spatial_lengths, - std::vector output_spatial_lengths, - std::vector conv_filter_strides, - std::vector conv_filter_dilations, - std::vector input_left_pads, - std::vector input_right_pads) -{ - const ck::index_t Y = filter_spatial_lengths[0]; - const ck::index_t X = filter_spatial_lengths[1]; - - const ck::index_t Hi = input_spatial_lengths[0]; - const ck::index_t Wi = input_spatial_lengths[1]; - - const ck::index_t Ho = output_spatial_lengths[0]; - const ck::index_t Wo = output_spatial_lengths[1]; - - auto f_host_tensor_descriptor = - [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) { - if constexpr(is_same::value || - is_same::value || - is_same::value) - { - return HostTensorDescriptor(std::vector({N_, C_, H, W}), - std::vector({C_ * H * W, H * W, W, 1})); - } - else if constexpr(is_same::value || - is_same::value || - is_same::value) - { - return HostTensorDescriptor(std::vector({N_, C_, H, W}), - std::vector({C_ * H * W, 1, W * C_, C_})); - } - }; - - Tensor in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{})); - Tensor wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{})); - Tensor out_n_k_ho_wo_host_result( - f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{})); - Tensor out_n_k_ho_wo_device_result( - f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{})); - - std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl; - std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl; - std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl; - - switch(init_method) - { - case 0: break; - case 1: - in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - break; - default: - in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); - wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); - } - - using 
InElementOp = ck::tensor_operation::element_wise::PassThrough; - using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; - using OutElementOp = ck::tensor_operation::element_wise::PassThrough; - - const auto in_element_op = InElementOp{}; - const auto wei_element_op = WeiElementOp{}; - const auto out_element_op = OutElementOp{}; - - if(do_verification) - { - using ReferenceConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd; - - auto ref_conv = ReferenceConvFwdInstance{}; - auto ref_invoker = ref_conv.MakeInvoker(); - auto ref_argument = ref_conv.MakeArgument(in_n_c_hi_wi, - wei_k_c_y_x, - out_n_k_ho_wo_host_result, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads, - in_element_op, - wei_element_op, - out_element_op); - - ref_invoker.Run(ref_argument); - } - - DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace()); - DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace()); - DeviceMem out_device_buf(sizeof(OutDataType) * - out_n_k_ho_wo_device_result.mDesc.GetElementSpace()); - - in_device_buf.ToDevice(in_n_c_hi_wi.mData.data()); - wei_device_buf.ToDevice(wei_k_c_y_x.mData.data()); - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - - using DeviceConvFwdNoOpPtr = - ck::tensor_operation::device::DeviceConvFwdPtr; - - // add device Conv instances - std::vector conv_ptrs; - - if constexpr(ck::is_same_v, float> && - ck::is_same_v, float> && - ck::is_same_v, float>) - { - ck::tensor_operation::device::device_conv2d_fwd_instance:: - add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs); - } - else if constexpr(ck::is_same_v, ck::half_t> && - ck::is_same_v, ck::half_t> && - ck::is_same_v, ck::half_t>) - { - ck::tensor_operation::device::device_conv2d_fwd_instance:: - add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs); - - ck::tensor_operation::device::device_conv2d_fwd_instance:: - 
add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(conv_ptrs); - } - else if constexpr(ck::is_same_v, bhalf_t> && - ck::is_same_v, bhalf_t> && - ck::is_same_v, bhalf_t>) - { - ck::tensor_operation::device::device_conv2d_fwd_instance:: - add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs); - } - else if constexpr(ck::is_same_v, int8_t> && - ck::is_same_v, int8_t> && - ck::is_same_v, int8_t>) - { - ck::tensor_operation::device::device_conv2d_fwd_instance:: - add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(conv_ptrs); - } - - if(conv_ptrs.size() <= 0) - { - throw std::runtime_error("wrong! no device Conv instance found"); - } - - std::string best_conv_name; - float best_ave_time = 0; - float best_tflops = 0; - float best_gb_per_sec = 0; - - // profile device Conv instances - for(auto& conv_ptr : conv_ptrs) - { - auto argument_ptr = conv_ptr->MakeArgumentPointer( - static_cast(in_device_buf.GetDeviceBuffer()), - static_cast(wei_device_buf.GetDeviceBuffer()), - static_cast(out_device_buf.GetDeviceBuffer()), - N, - K, - C, - input_spatial_lengths, - filter_spatial_lengths, - output_spatial_lengths, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads, - in_element_op, - wei_element_op, - out_element_op); - - auto invoker_ptr = conv_ptr->MakeInvokerPointer(); - - if(conv_ptr->IsSupportedArgument(argument_ptr.get())) - { - std::string conv_name = conv_ptr->GetTypeString(); - - float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat); - - std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X; - - std::size_t num_btype = sizeof(InDataType) * (N * C * Hi * Wi) + - sizeof(WeiDataType) * (K * C * Y * X) + - sizeof(OutDataType) * (N * K * Ho * Wo); - - float tflops = static_cast(flop) / 1.E9 / ave_time; - - float gb_per_sec = num_btype / 1.E6 / ave_time; - - std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec - << " GB/s, " << conv_name << std::endl; - - 
if(tflops > best_tflops) - { - best_conv_name = conv_name; - best_tflops = tflops; - best_ave_time = ave_time; - best_gb_per_sec = gb_per_sec; - } - - if(do_verification) - { - out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data()); - - ck::utils::check_err(out_n_k_ho_wo_device_result.mData, - out_n_k_ho_wo_host_result.mData); - - if(do_log) - { - LogRangeAsType(std::cout << "in : ", in_n_c_hi_wi.mData, ",") - << std::endl; - LogRangeAsType(std::cout << "wei: ", wei_k_c_y_x.mData, ",") - << std::endl; - LogRangeAsType( - std::cout << "out_host : ", out_n_k_ho_wo_host_result.mData, ",") - << std::endl; - LogRangeAsType( - std::cout << "out_device: ", out_n_k_ho_wo_device_result.mData, ",") - << std::endl; - } - } - } - } - - std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, " - << best_gb_per_sec << " GB/s, " << best_conv_name << std::endl; -} - -} // namespace profiler -} // namespace ck diff --git a/profiler/include/profile_convnd_fwd.hpp b/profiler/include/profile_convnd_fwd.hpp new file mode 100644 index 0000000000..a3b55a79d1 --- /dev/null +++ b/profiler/include/profile_convnd_fwd.hpp @@ -0,0 +1,9 @@ +#pragma once + +namespace ck { +namespace profiler { + +int profile_convnd_fwd(int argc, char* argv[]); + +} // namespace profiler +} // namespace ck diff --git a/profiler/src/profile_conv_fwd.cpp b/profiler/src/profile_conv_fwd.cpp deleted file mode 100644 index 3d4aa358f2..0000000000 --- a/profiler/src/profile_conv_fwd.cpp +++ /dev/null @@ -1,191 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "profile_conv_fwd_impl.hpp" - -enum struct ConvDataType -{ - F32_F32_F32, // 0 - F16_F16_F16, // 1 - BF16_BF16_BF16, // 2 - INT8_INT8_INT8, // 3 -}; - -enum struct ConvInputLayout -{ - NCHW, // 0 - NHWC, // 1 -}; - -enum struct ConvWeightLayout -{ - KCYX, // 0 - KYXC, // 1 -}; - -enum struct ConvOutputLayout -{ - NKHW, // 0 - NHWK, // 1 -}; - -int profile_conv_fwd(int argc, char* argv[]) -{ - 
if(argc != 25) - { - printf("arg1: tensor operation (conv_fwd: ForwardConvolution)\n"); - printf("arg2: data type (0: fp32; 1: fp16)\n"); - printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n"); - printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n"); - printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n"); - printf("arg6: verification (0: no; 1: yes)\n"); - printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n"); - printf("arg8: print tensor value (0: no; 1: yes)\n"); - printf("arg9: run kernel # of times (>1)\n"); - printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, " - "RightPx\n"); - exit(1); - } - - const auto data_type = static_cast(std::stoi(argv[2])); - const auto in_layout = static_cast(std::stoi(argv[3])); - const auto wei_layout = static_cast(std::stoi(argv[4])); - const auto out_layout = static_cast(std::stoi(argv[5])); - const bool do_verification = std::stoi(argv[6]); - const int init_method = std::stoi(argv[7]); - const bool do_log = std::stoi(argv[8]); - const int nrepeat = std::stoi(argv[9]); - - const ck::index_t N = std::stoi(argv[10]); - const ck::index_t K = std::stoi(argv[11]); - const ck::index_t C = std::stoi(argv[12]); - const ck::index_t Y = std::stoi(argv[13]); - const ck::index_t X = std::stoi(argv[14]); - const ck::index_t Hi = std::stoi(argv[15]); - const ck::index_t Wi = std::stoi(argv[16]); - - const ck::index_t conv_stride_h = std::stoi(argv[17]); - const ck::index_t conv_stride_w = std::stoi(argv[18]); - const ck::index_t conv_dilation_h = std::stoi(argv[19]); - const ck::index_t conv_dilation_w = std::stoi(argv[20]); - const ck::index_t in_left_pad_h = std::stoi(argv[21]); - const ck::index_t in_left_pad_w = std::stoi(argv[22]); - const ck::index_t in_right_pad_h = std::stoi(argv[23]); - const ck::index_t in_right_pad_w = std::stoi(argv[24]); - - const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1; - const ck::index_t XEff = (X - 1) * conv_dilation_w + 1; - 
- const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1; - const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; - - if(data_type == ConvDataType::F32_F32_F32 && in_layout == ConvInputLayout::NHWC && - wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK) - { - ck::profiler::profile_conv_fwd_impl<2, - float, - float, - float, - ck::tensor_layout::convolution::NHWC, - ck::tensor_layout::convolution::KYXC, - ck::tensor_layout::convolution::NHWK>( - do_verification, - init_method, - do_log, - nrepeat, - N, - K, - C, - std::vector{Hi, Wi}, - std::vector{Y, X}, - std::vector{Ho, Wo}, - std::vector{conv_stride_h, conv_stride_w}, - std::vector{conv_dilation_h, conv_dilation_w}, - std::vector{in_left_pad_h, in_left_pad_w}, - std::vector{in_right_pad_h, in_right_pad_w}); - } - else if(data_type == ConvDataType::F16_F16_F16 && in_layout == ConvInputLayout::NHWC && - wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK) - { - ck::profiler::profile_conv_fwd_impl<2, - ck::half_t, - ck::half_t, - ck::half_t, - ck::tensor_layout::convolution::NHWC, - ck::tensor_layout::convolution::KYXC, - ck::tensor_layout::convolution::NHWK>( - do_verification, - init_method, - do_log, - nrepeat, - N, - K, - C, - std::vector{Hi, Wi}, - std::vector{Y, X}, - std::vector{Ho, Wo}, - std::vector{conv_stride_h, conv_stride_w}, - std::vector{conv_dilation_h, conv_dilation_w}, - std::vector{in_left_pad_h, in_left_pad_w}, - std::vector{in_right_pad_h, in_right_pad_w}); - } - else if(data_type == ConvDataType::BF16_BF16_BF16 && in_layout == ConvInputLayout::NHWC && - wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK) - { - ck::profiler::profile_conv_fwd_impl<2, - uint16_t, - uint16_t, - uint16_t, - ck::tensor_layout::convolution::NHWC, - ck::tensor_layout::convolution::KYXC, - ck::tensor_layout::convolution::NHWK>( - do_verification, - init_method, - do_log, - 
nrepeat, - N, - K, - C, - std::vector{Hi, Wi}, - std::vector{Y, X}, - std::vector{Ho, Wo}, - std::vector{conv_stride_h, conv_stride_w}, - std::vector{conv_dilation_h, conv_dilation_w}, - std::vector{in_left_pad_h, in_left_pad_w}, - std::vector{in_right_pad_h, in_right_pad_w}); - } - else if(data_type == ConvDataType::INT8_INT8_INT8 && in_layout == ConvInputLayout::NHWC && - wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK) - { - ck::profiler::profile_conv_fwd_impl<2, - int8_t, - int8_t, - int8_t, - ck::tensor_layout::convolution::NHWC, - ck::tensor_layout::convolution::KYXC, - ck::tensor_layout::convolution::NHWK>( - do_verification, - init_method, - do_log, - nrepeat, - N, - K, - C, - std::vector{Hi, Wi}, - std::vector{Y, X}, - std::vector{Ho, Wo}, - std::vector{conv_stride_h, conv_stride_w}, - std::vector{conv_dilation_h, conv_dilation_w}, - std::vector{in_left_pad_h, in_left_pad_w}, - std::vector{in_right_pad_h, in_right_pad_w}); - } - else - { - throw std::runtime_error("wrong! 
this Conv data_type & layout is not implemented"); - } - - return 1; -} diff --git a/profiler/src/profile_convnd_bwd_data.cpp b/profiler/src/profile_convnd_bwd_data.cpp index 9de9170b57..893fb8c791 100644 --- a/profiler/src/profile_convnd_bwd_data.cpp +++ b/profiler/src/profile_convnd_bwd_data.cpp @@ -7,6 +7,8 @@ #include "profile_convnd_bwd_data_impl.hpp" +namespace { + enum struct ConvDataType { F32_F32_F32, // 0 @@ -76,6 +78,8 @@ ck::utils::conv::ConvParams parse_conv_params(int num_dim_spatial, char* argv[], return params; } +} // namespace + int profile_convnd_bwd_data(int argc, char* argv[], int num_dim_spatial) { const int preParams = 10; diff --git a/profiler/src/profile_convnd_fwd.cpp b/profiler/src/profile_convnd_fwd.cpp new file mode 100644 index 0000000000..1abd73c729 --- /dev/null +++ b/profiler/src/profile_convnd_fwd.cpp @@ -0,0 +1,351 @@ +#include +#include +#include +#include +#include +#include + +#include "conv_fwd_util.hpp" +#include "element_wise_operation.hpp" +#include "fill.hpp" +#include "profile_convnd_fwd.hpp" +#include "tensor_layout.hpp" + +namespace { + +enum struct ConvDataType +{ + F32_F32_F32, // 0 + F16_F16_F16, // 1 + BF16_BF16_BF16, // 2 + INT8_INT8_INT8, // 3 +}; + +enum struct ConvDataLayout +{ + NCHW, // 0 + NHWC, // 1 +}; + +namespace ctl = ck::tensor_layout::convolution; + +template +struct ConvolutionLayouts; + +template <> +struct ConvolutionLayouts<1, ConvDataLayout::NHWC> +{ + typedef ctl::NWC Input; + typedef ctl::KXC Weight; + typedef ctl::NWK Output; +}; +template <> +struct ConvolutionLayouts<2, ConvDataLayout::NHWC> +{ + typedef ctl::NHWC Input; + typedef ctl::KYXC Weight; + typedef ctl::NHWK Output; +}; +template <> +struct ConvolutionLayouts<3, ConvDataLayout::NHWC> +{ + typedef ctl::NDHWC Input; + typedef ctl::KZYXC Weight; + typedef ctl::NDHWK Output; +}; +template <> +struct ConvolutionLayouts<1, ConvDataLayout::NCHW> +{ + typedef ctl::NCW Input; + typedef ctl::KCX Weight; + typedef ctl::NKW Output; +}; 
+template <> +struct ConvolutionLayouts<2, ConvDataLayout::NCHW> +{ + typedef ctl::NCHW Input; + typedef ctl::KCYX Weight; + typedef ctl::NKHW Output; +}; +template <> +struct ConvolutionLayouts<3, ConvDataLayout::NCHW> +{ + typedef ctl::NCDHW Input; + typedef ctl::KCZYX Weight; + typedef ctl::NKDHW Output; +}; + +void print_use_msg() +{ + std::cout << "arg1: tensor operation (conv_fwd: ForwardConvolution)\n" + << "arg2: data type (0: fp32; 1: fp16, 2: bf16, 3: int8)\n" + << "arg3: data layout (0: NCHW; 1: NHWC)\n" + << "arg4: verification (0=no, 1=yes)\n" + << "arg5: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << "arg6: print tensor value (0: no; 1: yes)\n" + << "arg7: run kernel # of times (>1)\n" + << "arg8: N spatial dimensions (default 2)\n" + << "Following arguments (depending on number of spatial dims):\n" + << " N, K, C, \n" + << " , (ie Y, X for 2D)\n" + << " , (ie Hi, Wi for 2D)\n" + << " , (ie Sy, Sx for 2D)\n" + << " , (ie Dy, Dx for 2D)\n" + << " , (ie LeftPy, LeftPx for 2D)\n" + << " , (ie RightPy, RightPx for 2D)\n" + << std::endl; +} + +ck::utils::conv::ConvParams parse_params(int num_dim_spatial, int argc, char* argv[]) +{ + // (N, K, C) + num_dim_spatial * 6 (filter, input, strides, dilations, pad left, pad right) + int conv_args = 3 + num_dim_spatial * 6; + int cmdline_nargs = conv_args + 9; + if(cmdline_nargs != argc) + { + print_use_msg(); + exit(1); + } + int arg_idx = 9; + + return ck::utils::conv::parse_conv_params(num_dim_spatial, arg_idx, argv); +} + +template +void profile_convnd_instances_impl(const ck::utils::conv::ConvParams& params, + bool do_verification, + bool do_log, + int nrepeat, + int init_method, + ConvLayouts) +{ + using namespace std::placeholders; + using namespace ck::utils; + + std::unique_ptr> conv_instance; + + switch(init_method) + { + case 0: + conv_instance = + std::make_unique>(params, false); + break; + case 1: + conv_instance = std::make_unique< + conv::ConvFwdOpInstance, + 
ck::utils::FillUniform>>( + params, true, ck::utils::FillUniform{}, ck::utils::FillUniform{}); + break; + case 2: + conv_instance = std::make_unique< + conv::ConvFwdOpInstance, + ck::utils::FillUniform>>( + params, + true, + ck::utils::FillUniform{}, + ck::utils::FillUniform{}); + break; + default: throw std::runtime_error("Unsupported init method!"); + } + + auto reference_conv_fwd_fun = std::bind( + conv::run_reference_convolution_forward, + params, + _1, + _2, + _3); + OpInstanceRunEngine run_engine(*conv_instance, + reference_conv_fwd_fun); + auto best_conf = run_engine.Profile( + conv::ConvolutionFwdInstances::template Get(), + nrepeat, + do_verification, + do_log); + + std::cout << "Best configuration parameters:" + << "\nname: " << best_conf.best_op_name << "\navg_time: " << best_conf.best_avg_time + << "\ntflops: " << best_conf.best_tflops << "\nGB/s: " << best_conf.best_gb_per_sec + << std::endl; +} + +template +void profile_convnd_instances(ConvDataType data_type, + ConvDataLayout data_layout, + const ck::utils::conv::ConvParams& params, + bool do_verification, + bool do_log, + int nrepeat, + int init_method) +{ + switch(data_layout) + { + case ConvDataLayout::NHWC: { + switch(data_type) + { + case ConvDataType::F32_F32_F32: + profile_convnd_instances_impl( + params, + do_verification, + do_log, + nrepeat, + init_method, + ConvolutionLayouts{}); + break; + case ConvDataType::F16_F16_F16: + profile_convnd_instances_impl( + params, + do_verification, + do_log, + nrepeat, + init_method, + ConvolutionLayouts{}); + break; + case ConvDataType::BF16_BF16_BF16: + profile_convnd_instances_impl( + params, + do_verification, + do_log, + nrepeat, + init_method, + ConvolutionLayouts{}); + break; + case ConvDataType::INT8_INT8_INT8: + profile_convnd_instances_impl( + params, + do_verification, + do_log, + nrepeat, + init_method, + ConvolutionLayouts{}); + break; + } + break; + } + case ConvDataLayout::NCHW: { + switch(data_type) + { + case ConvDataType::F32_F32_F32: + 
profile_convnd_instances_impl( + params, + do_verification, + do_log, + nrepeat, + init_method, + ConvolutionLayouts{}); + break; + case ConvDataType::F16_F16_F16: + profile_convnd_instances_impl( + params, + do_verification, + do_log, + nrepeat, + init_method, + ConvolutionLayouts{}); + break; + case ConvDataType::BF16_BF16_BF16: + profile_convnd_instances_impl( + params, + do_verification, + do_log, + nrepeat, + init_method, + ConvolutionLayouts{}); + break; + case ConvDataType::INT8_INT8_INT8: + profile_convnd_instances_impl( + params, + do_verification, + do_log, + nrepeat, + init_method, + ConvolutionLayouts{}); + break; + } + break; + } + } +} + +} // namespace + +int ck::profiler::profile_convnd_fwd(int argc, char* argv[]) +{ + using namespace ck::utils::conv; + + ConvDataType data_type{ConvDataType::F32_F32_F32}; + ConvDataLayout data_layout{ConvDataLayout::NHWC}; + bool do_verification{true}; + int init_method{2}; + bool do_log{false}; + int nrepeat{100}; + int num_dim_spatial{2}; + ConvParams params; + + if(argc >= 4) + { + data_type = static_cast(std::stoi(argv[2])); + data_layout = static_cast(std::stoi(argv[3])); + } + if(argc >= 9) + { + do_verification = std::stoi(argv[4]); + init_method = std::stoi(argv[5]); + do_log = std::stoi(argv[6]); + nrepeat = std::stoi(argv[7]); + num_dim_spatial = std::stoi(argv[8]); + } + if(argc >= 10) + { + params = parse_params(num_dim_spatial, argc, argv); + } + + // TODO Print nice message what is being profiled. 
+ + switch(num_dim_spatial) + { + case 1: + profile_convnd_instances<1>( + data_type, data_layout, params, do_verification, do_log, nrepeat, init_method); + break; + case 2: + profile_convnd_instances<2>( + data_type, data_layout, params, do_verification, do_log, nrepeat, init_method); + break; + case 3: + profile_convnd_instances<3>( + data_type, data_layout, params, do_verification, do_log, nrepeat, init_method); + break; + default: + throw std::runtime_error("profile_conv_fwd: unsupported num_dim_spatial value: " + + std::to_string(num_dim_spatial)); + } + + return 1; +} diff --git a/profiler/src/profiler.cpp b/profiler/src/profiler.cpp index 3cd454e351..2a8078ca5f 100644 --- a/profiler/src/profiler.cpp +++ b/profiler/src/profiler.cpp @@ -4,6 +4,8 @@ #include #include +#include "profile_convnd_fwd.hpp" + int profile_gemm(int, char*[]); int profile_gemm_bias_2d(int, char*[]); int profile_gemm_bias_relu(int, char*[]); @@ -11,7 +13,6 @@ int profile_gemm_bias_relu_add(int, char*[]); int profile_gemm_reduce(int, char*[]); int profile_batched_gemm(int, char*[]); int profile_grouped_gemm(int, char*[]); -int profile_conv_fwd(int, char*[]); int profile_conv_fwd_bias_relu(int, char*[]); int profile_conv_fwd_bias_relu_add(int, char*[]); int profile_conv_fwd_bias_relu_atomic_add(int, char*[]); @@ -56,7 +57,7 @@ int main(int argc, char* argv[]) } else if(strcmp(argv[1], "conv_fwd") == 0) { - return profile_conv_fwd(argc, argv); + return ck::profiler::profile_convnd_fwd(argc, argv); } else if(strcmp(argv[1], "conv_fwd_bias_relu") == 0) { diff --git a/test/conv2d_bwd_weight/CMakeLists.txt b/test/conv2d_bwd_weight/CMakeLists.txt index 72e40d3eec..7b515b6b8e 100644 --- a/test/conv2d_bwd_weight/CMakeLists.txt +++ b/test/conv2d_bwd_weight/CMakeLists.txt @@ -4,5 +4,4 @@ include_directories(BEFORE ) add_test_executable(test_conv2d_bwd_weight conv2d_bwd_weight.cpp) -target_link_libraries(test_conv2d_bwd_weight PRIVATE host_tensor) -target_link_libraries(test_conv2d_bwd_weight PRIVATE 
device_conv2d_bwd_weight_instance) +target_link_libraries(test_conv2d_bwd_weight PRIVATE host_tensor device_conv2d_bwd_weight_instance conv_fwd_util) diff --git a/test/conv_util/CMakeLists.txt b/test/conv_util/CMakeLists.txt index 784f63ea6f..e3ba9574a2 100644 --- a/test/conv_util/CMakeLists.txt +++ b/test/conv_util/CMakeLists.txt @@ -1,2 +1,2 @@ add_test_executable(test_conv_util conv_util.cpp) -target_link_libraries(test_conv_util PRIVATE host_tensor) +target_link_libraries(test_conv_util PRIVATE host_tensor conv_fwd_util) diff --git a/test/convnd_bwd_data/CMakeLists.txt b/test/convnd_bwd_data/CMakeLists.txt index 4b45ec0fbf..58e6e7d3d0 100644 --- a/test/convnd_bwd_data/CMakeLists.txt +++ b/test/convnd_bwd_data/CMakeLists.txt @@ -4,5 +4,4 @@ include_directories(BEFORE ) add_test_executable(test_convnd_bwd_data convnd_bwd_data.cpp) -target_link_libraries(test_convnd_bwd_data PRIVATE host_tensor) -target_link_libraries(test_convnd_bwd_data PRIVATE device_convnd_bwd_data_instance) +target_link_libraries(test_convnd_bwd_data PRIVATE host_tensor device_convnd_bwd_data_instance conv_fwd_util) diff --git a/test/convnd_fwd/CMakeLists.txt b/test/convnd_fwd/CMakeLists.txt index 4608cdbe86..442c45dc8c 100644 --- a/test/convnd_fwd/CMakeLists.txt +++ b/test/convnd_fwd/CMakeLists.txt @@ -1,17 +1,15 @@ add_custom_target(test_convnd_fwd) add_test_executable(test_conv1d_fwd conv1d_fwd.cpp) -target_link_libraries(test_conv1d_fwd PRIVATE host_tensor) -target_link_libraries(test_conv1d_fwd PRIVATE device_conv1d_fwd_instance) +target_link_libraries(test_conv1d_fwd PRIVATE host_tensor device_conv1d_fwd_instance conv_fwd_util) +target_link_libraries(test_conv1d_fwd PRIVATE ) add_dependencies(test_convnd_fwd test_conv1d_fwd) add_test_executable(test_conv2d_fwd conv2d_fwd.cpp) -target_link_libraries(test_conv2d_fwd PRIVATE host_tensor) -target_link_libraries(test_conv2d_fwd PRIVATE device_conv2d_fwd_instance) +target_link_libraries(test_conv2d_fwd PRIVATE host_tensor 
device_conv2d_fwd_instance conv_fwd_util) add_dependencies(test_convnd_fwd test_conv2d_fwd) add_test_executable(test_conv3d_fwd conv3d_fwd.cpp) -target_link_libraries(test_conv3d_fwd PRIVATE host_tensor) -target_link_libraries(test_conv3d_fwd PRIVATE device_conv3d_fwd_instance) +target_link_libraries(test_conv3d_fwd PRIVATE host_tensor device_conv3d_fwd_instance conv_fwd_util) add_dependencies(test_convnd_fwd test_conv3d_fwd) diff --git a/test/convnd_fwd/conv1d_fwd.cpp b/test/convnd_fwd/conv1d_fwd.cpp index e6df0e6f8c..df3b3a2945 100644 --- a/test/convnd_fwd/conv1d_fwd.cpp +++ b/test/convnd_fwd/conv1d_fwd.cpp @@ -7,37 +7,15 @@ #include "element_wise_operation.hpp" #include "conv_fwd_util.hpp" #include "conv_util.hpp" -#include "host_tensor.hpp" -#include "tensor_layout.hpp" -#include "check_err.hpp" - -// Forward declarations for conv instances. - -using DeviceConvFwdNoOpPtr = - ck::tensor_operation::device::DeviceConvFwdPtr; - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_conv1d_fwd_instance { - -void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instances(std::vector&); -void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instances(std::vector&); -void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instances(std::vector&); -void add_device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances(std::vector&); - -} // namespace device_conv1d_fwd_instance -} // namespace device -} // namespace tensor_operation -} // namespace ck namespace { bool test_conv1D_nwc() { - bool res{true}; + using namespace std::placeholders; + using namespace ck::utils; + namespace ctl = ck::tensor_layout::convolution; + ck::utils::conv::ConvParams params; params.num_dim_spatial = 1; params.N = 2; @@ -50,30 +28,26 @@ bool test_conv1D_nwc() params.input_left_pads = std::vector{1}; params.input_right_pads = std::vector{1}; - auto host_tensors = - ck::utils::conv::get_host_tensors(params); - const Tensor& input = std::get<0>(host_tensors); - const Tensor& weights = 
std::get<1>(host_tensors); - Tensor& host_output = std::get<2>(host_tensors); - Tensor& device_output = std::get<3>(host_tensors); + std::vector conv_ptrs; + test::conv::get_test_convolution_fwd_instance<1>(conv_ptrs); + conv::ConvFwdOpInstance conv_instance( + params); - ck::utils::conv::run_reference_convolution_forward<1>(params, input, weights, host_output); - test::conv::RunConv<1>(params, input, weights, device_output); - res = res && - ck::utils::check_err( - device_output.mData, host_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f); - - return res; + auto reference_conv_fwd_fun = std::bind( + conv::run_reference_convolution_forward<1, float, float, float>, params, _1, _2, _3); + OpInstanceRunEngine run_engine(conv_instance, reference_conv_fwd_fun); + run_engine.SetAtol(1e-5); + run_engine.SetRtol(1e-4); + return run_engine.Test(conv_ptrs); } template -bool test_conv1d_nwc_instances(const std::vector& conv_ptrs) +bool test_conv1d_nwc_instances(const std::vector& conv_ptrs) { + using namespace std::placeholders; + using namespace ck::utils; + namespace ctl = ck::tensor_layout::convolution; + ck::utils::conv::ConvParams params; params.num_dim_spatial = 1; params.filter_spatial_lengths = std::vector{3}; @@ -83,52 +57,36 @@ bool test_conv1d_nwc_instances(const std::vector& conv_ptr params.input_left_pads = std::vector{1}; params.input_right_pads = std::vector{1}; - auto host_tensors = - ck::utils::conv::get_host_tensors(params); - const Tensor& input = std::get<0>(host_tensors); - const Tensor& weights = std::get<1>(host_tensors); - Tensor& host_output = std::get<2>(host_tensors); - Tensor& device_output = std::get<3>(host_tensors); + conv::ConvFwdOpInstance conv_instance(params); - ck::utils::conv::run_reference_convolution_forward<1>(params, input, weights, host_output); - return ck::utils::conv::run_convolution_forward_instances<1>( - params, conv_ptrs, input, weights, device_output, host_output); + auto reference_conv_fwd_fun = + 
std::bind(conv::run_reference_convolution_forward<1, T, T, T>, params, _1, _2, _3); + OpInstanceRunEngine run_engine(conv_instance, reference_conv_fwd_fun); + return run_engine.Test(conv_ptrs); } + bool test_conv1d_nwc_bf16_instances() { - std::vector conv_ptrs; - ck::tensor_operation::device::device_conv1d_fwd_instance:: - add_device_conv1d_fwd_xdl_nwc_kxc_nwk_bf16_instances(conv_ptrs); - return test_conv1d_nwc_instances(conv_ptrs); + return test_conv1d_nwc_instances( + ck::utils::conv::ConvolutionFwdInstances::Get<1>()); } bool test_conv1d_nwc_f16_instances() { - std::vector conv_ptrs; - ck::tensor_operation::device::device_conv1d_fwd_instance:: - add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f16_instances(conv_ptrs); - return test_conv1d_nwc_instances(conv_ptrs); + return test_conv1d_nwc_instances( + ck::utils::conv::ConvolutionFwdInstances::Get<1>()); } bool test_conv1d_nwc_f32_instances() { - std::vector conv_ptrs; - ck::tensor_operation::device::device_conv1d_fwd_instance:: - add_device_conv1d_fwd_xdl_nwc_kxc_nwk_f32_instances(conv_ptrs); - return test_conv1d_nwc_instances(conv_ptrs); + return test_conv1d_nwc_instances( + ck::utils::conv::ConvolutionFwdInstances::Get<1>()); } bool test_conv1d_nwc_int8_instances() { - std::vector conv_ptrs; - ck::tensor_operation::device::device_conv1d_fwd_instance:: - add_device_conv1d_fwd_xdl_nwc_kxc_nwk_int8_instances(conv_ptrs); - return test_conv1d_nwc_instances(conv_ptrs); + return test_conv1d_nwc_instances( + ck::utils::conv::ConvolutionFwdInstances::Get<1>()); } } // anonymous namespace @@ -149,7 +107,7 @@ int main() std::cout << "\ntest_conv1d_nwc_f32_instances ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; res = test_conv1d_nwc_int8_instances(); - std::cout << "\ntes_tconv1_dnw_cint_8instances ..... " << (res ? "SUCCESS" : "FAILURE") + std::cout << "\ntest_conv1d_nwc_int8_instances ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; return res ? 
0 : 1; diff --git a/test/convnd_fwd/conv2d_fwd.cpp b/test/convnd_fwd/conv2d_fwd.cpp index 2a46d74495..f35c69bbd0 100644 --- a/test/convnd_fwd/conv2d_fwd.cpp +++ b/test/convnd_fwd/conv2d_fwd.cpp @@ -1,6 +1,5 @@ #include #include -#include #include #include @@ -8,38 +7,14 @@ #include "element_wise_operation.hpp" #include "conv_fwd_util.hpp" #include "conv_util.hpp" -#include "host_tensor.hpp" -#include "tensor_layout.hpp" -#include "check_err.hpp" - -// Forward declarations for conv instances. -using DeviceConvFwdNoOpPtr = - ck::tensor_operation::device::DeviceConvFwdPtr; - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_conv2d_fwd_instance { - -void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(std::vector&); -void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(std::vector&); -void add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances( - std::vector&); -void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(std::vector&); -void add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(std::vector&); - -} // namespace device_conv2d_fwd_instance -} // namespace device -} // namespace tensor_operation -} // namespace ck namespace { bool test_conv2d_nhwc() { - bool res{true}; + using namespace std::placeholders; + using namespace ck::utils; + ck::utils::conv::ConvParams params; params.N = 2; params.K = 16; @@ -47,25 +22,25 @@ bool test_conv2d_nhwc() params.input_spatial_lengths = std::vector{16, 16}; params.conv_filter_strides = std::vector{1, 1}; - auto host_tensors = ck::utils::conv::get_host_tensors(params); - const Tensor& input = std::get<0>(host_tensors); - const Tensor& weights = std::get<1>(host_tensors); - Tensor& host_output = std::get<2>(host_tensors); - Tensor& device_output = std::get<3>(host_tensors); + std::vector conv_ptrs; + test::conv::get_test_convolution_fwd_instance<2>(conv_ptrs); + conv::ConvFwdOpInstance conv_instance(params); - 
ck::utils::conv::run_reference_convolution_forward<2>(params, input, weights, host_output); - test::conv::RunConv<2>(params, input, weights, device_output); - res = res && - ck::utils::check_err( - device_output.mData, host_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f); - - return res; + auto reference_conv_fwd_fun = std::bind( + conv::run_reference_convolution_forward<2, float, float, float>, params, _1, _2, _3); + OpInstanceRunEngine run_engine(conv_instance, reference_conv_fwd_fun); + run_engine.SetAtol(1e-5); + run_engine.SetRtol(1e-4); + return run_engine.Test(conv_ptrs); } template -bool test_conv2d_nhwc_instances(const std::vector& conv_ptrs) +bool test_conv2d_nhwc_instances(const std::vector& conv_ptrs) { - ck::utils::conv::ConvParams params; + using namespace std::placeholders; + using namespace ck::utils; + + conv::ConvParams params; params.num_dim_spatial = 2; params.filter_spatial_lengths = std::vector{3, 3}; params.input_spatial_lengths = std::vector{71, 71}; @@ -74,55 +49,36 @@ bool test_conv2d_nhwc_instances(const std::vector& conv_pt params.input_left_pads = std::vector{1, 1}; params.input_right_pads = std::vector{1, 1}; - auto host_tensors = - ck::utils::conv::get_host_tensors(params); - const Tensor& input = std::get<0>(host_tensors); - const Tensor& weights = std::get<1>(host_tensors); - Tensor& host_output = std::get<2>(host_tensors); - Tensor& device_output = std::get<3>(host_tensors); + conv::ConvFwdOpInstance conv_instance(params); - ck::utils::conv::run_reference_convolution_forward<2>(params, input, weights, host_output); - return ck::utils::conv::run_convolution_forward_instances<2>( - params, conv_ptrs, input, weights, device_output, host_output); + auto reference_conv_fwd_fun = + std::bind(conv::run_reference_convolution_forward<2, T, T, T>, params, _1, _2, _3); + OpInstanceRunEngine run_engine(conv_instance, reference_conv_fwd_fun); + return run_engine.Test(conv_ptrs); } bool test_conv2d_nhwc_bf16_instances() { - std::vector 
conv_ptrs; - ck::tensor_operation::device::device_conv2d_fwd_instance:: - add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_bf16_instances(conv_ptrs); - return test_conv2d_nhwc_instances(conv_ptrs); + return test_conv2d_nhwc_instances( + ck::utils::conv::ConvolutionFwdInstances::Get<2>()); } bool test_conv2d_nhwc_f16_instances() { - std::vector conv_ptrs; - ck::tensor_operation::device::device_conv2d_fwd_instance:: - add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs); - ck::tensor_operation::device::device_conv2d_fwd_instance:: - add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(conv_ptrs); - return test_conv2d_nhwc_instances(conv_ptrs); + return test_conv2d_nhwc_instances( + ck::utils::conv::ConvolutionFwdInstances::Get<2>()); } bool test_conv2d_nhwc_f32_instances() { - std::vector conv_ptrs; - ck::tensor_operation::device::device_conv2d_fwd_instance:: - add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f32_instances(conv_ptrs); - return test_conv2d_nhwc_instances(conv_ptrs); + return test_conv2d_nhwc_instances( + ck::utils::conv::ConvolutionFwdInstances::Get<2>()); } bool test_conv2d_nhwc_int8_instances() { - std::vector conv_ptrs; - ck::tensor_operation::device::device_conv2d_fwd_instance:: - add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_int8_instances(conv_ptrs); - return test_conv2d_nhwc_instances(conv_ptrs); + return test_conv2d_nhwc_instances( + ck::utils::conv::ConvolutionFwdInstances::Get<2>()); } } // anonymous namespace diff --git a/test/convnd_fwd/conv3d_fwd.cpp b/test/convnd_fwd/conv3d_fwd.cpp index 3dc1a6b160..2375148753 100644 --- a/test/convnd_fwd/conv3d_fwd.cpp +++ b/test/convnd_fwd/conv3d_fwd.cpp @@ -8,37 +8,16 @@ #include "element_wise_operation.hpp" #include "conv_fwd_util.hpp" #include "conv_util.hpp" -#include "host_tensor.hpp" -#include "tensor_layout.hpp" -#include "check_err.hpp" - -// Forward declarations for conv instances. 
-using DeviceConvFwdNoOpPtr = - ck::tensor_operation::device::DeviceConvFwdPtr; - -namespace ck { -namespace tensor_operation { -namespace device { -namespace device_conv3d_fwd_instance { - -void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(std::vector&); -void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instances(std::vector&); -void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instances(std::vector&); -void add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instances(std::vector&); - -} // namespace device_conv3d_fwd_instance -} // namespace device -} // namespace tensor_operation -} // namespace ck namespace { bool test_conv3d_ndhwc() { - bool res{true}; - ck::utils::conv::ConvParams params; + using namespace std::placeholders; + using namespace ck::utils; + namespace ctl = ck::tensor_layout::convolution; + + conv::ConvParams params; params.num_dim_spatial = 3; params.N = 2; params.K = 16; @@ -50,31 +29,26 @@ bool test_conv3d_ndhwc() params.input_left_pads = std::vector{1, 1, 1}; params.input_right_pads = std::vector{1, 1, 1}; - auto host_tensors = - ck::utils::conv::get_host_tensors(params); - const Tensor& input = std::get<0>(host_tensors); - const Tensor& weights = std::get<1>(host_tensors); - Tensor& host_output = std::get<2>(host_tensors); - Tensor& device_output = std::get<3>(host_tensors); + std::vector conv_ptrs; + test::conv::get_test_convolution_fwd_instance<3>(conv_ptrs); + conv::ConvFwdOpInstance conv_instance( + params); - ck::utils::conv::run_reference_convolution_forward<3>(params, input, weights, host_output); - test::conv::RunConv<3>(params, input, weights, device_output); - res = res && - ck::utils::check_err( - device_output.mData, host_output.mData, "Error: incorrect results!", 1e-5f, 1e-4f); - - return res; + auto reference_conv_fwd_fun = std::bind( + conv::run_reference_convolution_forward<3, float, float, float>, params, _1, _2, _3); + OpInstanceRunEngine run_engine(conv_instance, reference_conv_fwd_fun); + 
run_engine.SetAtol(1e-5); + run_engine.SetRtol(1e-4); + return run_engine.Test(conv_ptrs); } bool test_conv3d_ndhwc_2gb_input() { + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + using namespace ck::utils; + // >2GB Input - ck::utils::conv::ConvParams params; + conv::ConvParams params; params.num_dim_spatial = 3; params.N = 2; params.K = 16; @@ -86,39 +60,35 @@ bool test_conv3d_ndhwc_2gb_input() params.input_left_pads = std::vector{1, 1, 1}; params.input_right_pads = std::vector{1, 1, 1}; - auto host_tensors = - ck::utils::conv::get_host_tensors(params, false); - const Tensor& input = std::get<0>(host_tensors); - const Tensor& weights = std::get<1>(host_tensors); - Tensor& device_output = std::get<3>(host_tensors); + std::vector conv_ptrs; + test::conv::get_test_convolution_fwd_instance<3>(conv_ptrs); - try - { - test::conv::RunConv<3>(params, input, weights, device_output); - } - catch(const std::runtime_error& err) - { - std::string err_msg{"Error! device_conv with the specified compilation parameters does " - "not support this Conv problem"}; - if(err.what() != err_msg) - { - return false; - } - return true; - } - std::cout << "Error: Failure checking oversized tensor!" 
<< std::endl; - return false; + auto arg = conv_ptrs.back()->MakeArgumentPointer(nullptr, + nullptr, + nullptr, + params.N, + params.K, + params.C, + params.input_spatial_lengths, + params.filter_spatial_lengths, + params.GetOutputSpatialLengths(), + params.conv_filter_strides, + params.conv_filter_dilations, + params.input_left_pads, + params.input_right_pads, + PassThrough{}, + PassThrough{}, + PassThrough{}); + return !(conv_ptrs.back()->IsSupportedArgument(arg.get())); } bool test_conv3d_ndhwc_2gb_filters() { + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + using namespace ck::utils; + // >2GB Filters - ck::utils::conv::ConvParams params; + conv::ConvParams params; params.num_dim_spatial = 3; params.N = 2; params.K = 16; @@ -130,39 +100,35 @@ bool test_conv3d_ndhwc_2gb_filters() params.input_left_pads = std::vector{1, 1, 1}; params.input_right_pads = std::vector{1, 1, 1}; - auto host_tensors = - ck::utils::conv::get_host_tensors(params, false); - const Tensor& input = std::get<0>(host_tensors); - const Tensor& weights = std::get<1>(host_tensors); - Tensor& device_output = std::get<3>(host_tensors); + std::vector conv_ptrs; + test::conv::get_test_convolution_fwd_instance<3>(conv_ptrs); - try - { - test::conv::RunConv<3>(params, input, weights, device_output); - } - catch(const std::runtime_error& err) - { - std::string err_msg{"Error! device_conv with the specified compilation parameters does " - "not support this Conv problem"}; - if(err.what() != err_msg) - { - return false; - } - return true; - } - std::cout << "Error: Failure checking oversized tensor!" 
<< std::endl; - return false; + auto arg = conv_ptrs.back()->MakeArgumentPointer(nullptr, + nullptr, + nullptr, + params.N, + params.K, + params.C, + params.input_spatial_lengths, + params.filter_spatial_lengths, + params.GetOutputSpatialLengths(), + params.conv_filter_strides, + params.conv_filter_dilations, + params.input_left_pads, + params.input_right_pads, + PassThrough{}, + PassThrough{}, + PassThrough{}); + return !(conv_ptrs.back()->IsSupportedArgument(arg.get())); } bool test_conv3d_ndhwc_2gb_output() { + using PassThrough = ck::tensor_operation::element_wise::PassThrough; + using namespace ck::utils; + // >2GB Output - ck::utils::conv::ConvParams params; + conv::ConvParams params; params.num_dim_spatial = 3; params.N = 2; params.K = 16; @@ -174,39 +140,35 @@ bool test_conv3d_ndhwc_2gb_output() params.input_left_pads = std::vector{2, 2, 2}; params.input_right_pads = std::vector{2, 2, 2}; - auto host_tensors = - ck::utils::conv::get_host_tensors(params, false); - const Tensor& input = std::get<0>(host_tensors); - const Tensor& weights = std::get<1>(host_tensors); - Tensor& device_output = std::get<3>(host_tensors); - - try - { - test::conv::RunConv<3>(params, input, weights, device_output); - } - catch(const std::runtime_error& err) - { - std::string err_msg{"Error! device_conv with the specified compilation parameters does " - "not support this Conv problem"}; - if(err.what() != err_msg) - { - return false; - } - return true; - } - std::cout << "Error: Failure checking oversized tensor!" 
<< std::endl; - return false; + std::vector conv_ptrs; + test::conv::get_test_convolution_fwd_instance<3>(conv_ptrs); + auto arg = conv_ptrs.back()->MakeArgumentPointer(nullptr, + nullptr, + nullptr, + params.N, + params.K, + params.C, + params.input_spatial_lengths, + params.filter_spatial_lengths, + params.GetOutputSpatialLengths(), + params.conv_filter_strides, + params.conv_filter_dilations, + params.input_left_pads, + params.input_right_pads, + PassThrough{}, + PassThrough{}, + PassThrough{}); + return !(conv_ptrs.back()->IsSupportedArgument(arg.get())); } template -bool test_conv3d_ndhwc_instances(const std::vector& conv_ptrs) +bool test_conv3d_ndhwc_instances(const std::vector& conv_ptrs) { - ck::utils::conv::ConvParams params; + using namespace std::placeholders; + using namespace ck::utils; + namespace ctl = ck::tensor_layout::convolution; + + conv::ConvParams params; params.N = 64; params.num_dim_spatial = 3; params.filter_spatial_lengths = std::vector{3, 3, 2}; @@ -216,53 +178,36 @@ bool test_conv3d_ndhwc_instances(const std::vector& conv_p params.input_left_pads = std::vector{1, 1, 1}; params.input_right_pads = std::vector{1, 1, 1}; - auto host_tensors = - ck::utils::conv::get_host_tensors(params); - const Tensor& input = std::get<0>(host_tensors); - const Tensor& weights = std::get<1>(host_tensors); - Tensor& host_output = std::get<2>(host_tensors); - Tensor& device_output = std::get<3>(host_tensors); + conv::ConvFwdOpInstance conv_instance(params); - ck::utils::conv::run_reference_convolution_forward<3>(params, input, weights, host_output); - return ck::utils::conv::run_convolution_forward_instances<3>( - params, conv_ptrs, input, weights, device_output, host_output); + auto reference_conv_fwd_fun = + std::bind(conv::run_reference_convolution_forward<3, T, T, T>, params, _1, _2, _3); + OpInstanceRunEngine run_engine(conv_instance, reference_conv_fwd_fun); + return run_engine.Test(conv_ptrs); } bool test_conv3d_ndhwc_bf16_instances() { - std::vector 
conv_ptrs; - ck::tensor_operation::device::device_conv3d_fwd_instance:: - add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_bf16_instances(conv_ptrs); - return test_conv3d_ndhwc_instances(conv_ptrs); + return test_conv3d_ndhwc_instances( + ck::utils::conv::ConvolutionFwdInstances::Get<3>()); } bool test_conv3d_ndhwc_f16_instances() { - std::vector conv_ptrs; - ck::tensor_operation::device::device_conv3d_fwd_instance:: - add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f16_instances(conv_ptrs); - return test_conv3d_ndhwc_instances(conv_ptrs); + return test_conv3d_ndhwc_instances( + ck::utils::conv::ConvolutionFwdInstances::Get<3>()); } bool test_conv3d_ndhwc_f32_instances() { - std::vector conv_ptrs; - ck::tensor_operation::device::device_conv3d_fwd_instance:: - add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_f32_instances(conv_ptrs); - return test_conv3d_ndhwc_instances(conv_ptrs); + return test_conv3d_ndhwc_instances( + ck::utils::conv::ConvolutionFwdInstances::Get<3>()); } bool test_conv3d_ndhwc_int8_instances() { - std::vector conv_ptrs; - ck::tensor_operation::device::device_conv3d_fwd_instance:: - add_device_conv3d_fwd_xdl_ndhwc_kzyxc_ndhwk_int8_instances(conv_ptrs); - return test_conv3d_ndhwc_instances(conv_ptrs); + return test_conv3d_ndhwc_instances( + ck::utils::conv::ConvolutionFwdInstances::Get<3>()); } } // anonymous namespace @@ -293,7 +238,7 @@ int main() std::cout << "\ntest_conv3d_ndhwc_f32_instances ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; res = test_conv3d_ndhwc_int8_instances(); - std::cout << "\ntest_conv3d_ndhw_cint_8instances ..... " << (res ? "SUCCESS" : "FAILURE") + std::cout << "\ntest_conv3d_ndhwc_int8_instances ..... " << (res ? "SUCCESS" : "FAILURE") << std::endl; return res ? 
0 : 1; diff --git a/test/convnd_fwd/conv_util.hpp b/test/convnd_fwd/conv_util.hpp index d62dab7366..4f77101563 100644 --- a/test/convnd_fwd/conv_util.hpp +++ b/test/convnd_fwd/conv_util.hpp @@ -10,7 +10,8 @@ #include "host_tensor.hpp" #include "sequence.hpp" -namespace { +namespace test { +namespace conv { template using S = ck::Sequence; @@ -19,6 +20,9 @@ using InElementOp = ck::tensor_operation::element_wise::PassThrough; using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; using OutElementOp = ck::tensor_operation::element_wise::PassThrough; +using DeviceConvFwdNoOpPtr = + ck::tensor_operation::device::DeviceConvFwdPtr; + static constexpr auto ConvFwdDefault = ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; @@ -62,26 +66,14 @@ using DeviceConvNDFwdInstance = ck::tensor_operation::device:: 1>; // CThreadTransferDstScalarPerVector // clang-format on -} // namespace - -namespace test { -namespace conv { - template -void RunConv(const ck::utils::conv::ConvParams& params, - const Tensor& input, - const Tensor& weights, - Tensor& output) +void get_test_convolution_fwd_instance(std::vector& instances) { - ck::utils::conv::run_convolution_forward( - params, input, weights, output); + using ConvInstanceT = DeviceConvNDFwdInstance; + instances.emplace_back(std::make_unique()); } } // namespace conv diff --git a/test/reference_conv_fwd/CMakeLists.txt b/test/reference_conv_fwd/CMakeLists.txt index bd9140909c..9d0bf45ef5 100644 --- a/test/reference_conv_fwd/CMakeLists.txt +++ b/test/reference_conv_fwd/CMakeLists.txt @@ -1,2 +1,2 @@ add_test_executable(test_reference_conv_fwd reference_conv_fwd.cpp) -target_link_libraries(test_reference_conv_fwd PRIVATE host_tensor) +target_link_libraries(test_reference_conv_fwd PRIVATE host_tensor conv_fwd_util) diff --git a/test/reference_conv_fwd/reference_conv_fwd.cpp b/test/reference_conv_fwd/reference_conv_fwd.cpp index d852e8f5eb..e163298041 100644 --- 
a/test/reference_conv_fwd/reference_conv_fwd.cpp +++ b/test/reference_conv_fwd/reference_conv_fwd.cpp @@ -1,4 +1,3 @@ -#include #include #include #include @@ -10,6 +9,7 @@ #include "config.hpp" #include "conv_fwd_util.hpp" #include "element_wise_operation.hpp" +#include "fill.hpp" #include "host_tensor.hpp" #include "reference_conv_fwd.hpp" #include "tensor_layout.hpp" @@ -19,35 +19,6 @@ using InElementOp = ck::tensor_operation::element_wise::PassThrough; using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; using OutElementOp = ck::tensor_operation::element_wise::PassThrough; -template -struct FillMonotonicSeq -{ - T m_init_value{0}; - T m_step{1}; - - template - void operator()(ForwardIter first, ForwardIter last) const - { - std::generate(first, last, [=, n = m_init_value]() mutable { - auto tmp = n; - n += m_step; - return tmp; - }); - } -}; - -template -struct FillConstant -{ - T m_value{0}; - - template - void operator()(ForwardIter first, ForwardIter last) const - { - std::fill(first, last, m_value); - } -}; - template , - typename FillWeightsOp = FillConstant> + typename FillInputOp = ck::utils::FillMonotonicSeq, + typename FillWeightsOp = ck::utils::FillConstant> Tensor run_reference_convolution_forward(const ck::utils::conv::ConvParams& params, const FillInputOp& fill_input_op = FillInputOp{}, @@ -251,7 +222,7 @@ bool test_conv1d_nwc() ck::tensor_layout::convolution::NWC, ck::tensor_layout::convolution::KXC, ck::tensor_layout::convolution::NWK>( - params, FillMonotonicSeq{0.f, 0.1f}); + params, ck::utils::FillMonotonicSeq{0.f, 0.1f}); ref_dims = std::vector{2, 16, 16}; ref_data = std::vector{ @@ -349,7 +320,7 @@ bool test_conv3d_ncdhw() ck::tensor_layout::convolution::NCDHW, ck::tensor_layout::convolution::KCZYX, ck::tensor_layout::convolution::NKDHW>( - params, FillMonotonicSeq{0.f, 0.1f}); + params, ck::utils::FillMonotonicSeq{0.f, 0.1f}); std::vector ref_dims{1, 1, 4, 4, 4}; std::vector ref_data{ 407.7, 410.40002, 413.09998, 415.80002, 
423.90002, 426.6, 429.30002, 432., @@ -383,7 +354,7 @@ bool test_conv3d_ncdhw() ck::tensor_layout::convolution::NCDHW, ck::tensor_layout::convolution::KCZYX, ck::tensor_layout::convolution::NKDHW>( - params, FillMonotonicSeq{0.f, 0.1f}); + params, ck::utils::FillMonotonicSeq{0.f, 0.1f}); ref_dims = std::vector{1, 2, 4, 4, 4}; ref_data = std::vector{ 2756.7002, 2764.7998, 2772.9001, 2781., 2853.9001, 2862., 2870.1, 2878.2002,