From 116e10532d521c029aa8c51b8268251bb77088f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Tue, 31 Oct 2023 10:46:32 +0100 Subject: [PATCH] Add support for groups in Img2Col/Col2Img (#1007) * Add support for groups in Img2Col/Col2Img * Fix interface test * Fix interface test G to N * Improve performance * Change gemm layout to 3d * Fixes [ROCm/composable_kernel commit: 2e824c6d46acfd8690a155186be5c2dd4c6648ab] --- .../22_im2col_col2im/column_to_image.cpp | 18 +- .../22_im2col_col2im/image_to_column.cpp | 18 +- .../52_im2col_col2im/column_to_image_f32.cpp | 13 +- .../52_im2col_col2im/image_to_column_f32.cpp | 13 +- .../device/device_conv_tensor_rearrange.hpp | 17 +- .../impl/device_column_to_image_impl.hpp | 102 ++++-- .../impl/device_image_to_column_impl.hpp | 91 +++-- .../gpu/grid/gridwise_tensor_rearrange.hpp | 43 ++- .../cpu/reference_column_to_image.hpp | 41 +-- .../cpu/reference_image_to_column.hpp | 37 +- .../gpu/conv_tensor_rearrange.hpp | 333 +++++++++++++++--- .../gpu/column_to_image/CMakeLists.txt | 9 +- ...ce_column_to_image_gndhwc_3d_instance.cpp} | 8 +- ...ice_column_to_image_gnhwc_2d_instance.cpp} | 8 +- ...vice_column_to_image_gnwc_1d_instance.cpp} | 8 +- ...ice_column_to_image_ndhwgc_3d_instance.cpp | 62 ++++ ...vice_column_to_image_nhwgc_2d_instance.cpp | 62 ++++ ...evice_column_to_image_nwgc_1d_instance.cpp | 61 ++++ .../gpu/image_to_column/CMakeLists.txt | 9 +- ...ce_image_to_column_gndhwc_3d_instance.cpp} | 10 +- ...ice_image_to_column_gnhwc_2d_instance.cpp} | 10 +- ...vice_image_to_column_gnwc_1d_instance.cpp} | 10 +- ...ice_image_to_column_ndhwgc_3d_instance.cpp | 62 ++++ ...vice_image_to_column_nhwgc_2d_instance.cpp | 62 ++++ ...evice_image_to_column_nwgc_1d_instance.cpp | 61 ++++ profiler/README.md | 3 +- .../profile_conv_tensor_rearrange_impl.hpp | 31 +- .../src/profile_conv_tensor_rearrange.cpp | 133 ++++++- .../test_conv_tensor_rearrange.cpp | 46 +-- .../test_conv_tensor_rearrange_interface.cpp | 14 +- 30 files changed, 1114 insertions(+), 281 deletions(-) rename library/src/tensor_operation_instance/gpu/column_to_image/{device_column_to_image_nhwc_3d_instance.cpp => device_column_to_image_gndhwc_3d_instance.cpp} (87%) rename library/src/tensor_operation_instance/gpu/column_to_image/{device_column_to_image_nhwc_2d_instance.cpp => device_column_to_image_gnhwc_2d_instance.cpp} (88%) rename library/src/tensor_operation_instance/gpu/column_to_image/{device_column_to_image_nhwc_1d_instance.cpp => device_column_to_image_gnwc_1d_instance.cpp} (88%) create mode 100644 library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_ndhwgc_3d_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwgc_2d_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nwgc_1d_instance.cpp rename library/src/tensor_operation_instance/gpu/image_to_column/{device_image_to_column_nhwc_3d_instance.cpp => device_image_to_column_gndhwc_3d_instance.cpp} (83%) rename library/src/tensor_operation_instance/gpu/image_to_column/{device_image_to_column_nhwc_2d_instance.cpp => device_image_to_column_gnhwc_2d_instance.cpp} (84%) rename library/src/tensor_operation_instance/gpu/image_to_column/{device_image_to_column_nhwc_1d_instance.cpp => device_image_to_column_gnwc_1d_instance.cpp} (84%) create mode 100644 library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_ndhwgc_3d_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwgc_2d_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nwgc_1d_instance.cpp diff --git a/client_example/22_im2col_col2im/column_to_image.cpp b/client_example/22_im2col_col2im/column_to_image.cpp index 43338ce408..9ebe63198f 100644 --- a/client_example/22_im2col_col2im/column_to_image.cpp +++ b/client_example/22_im2col_col2im/column_to_image.cpp @@ -16,10 +16,10 @@ using InDataType = ck::half_t; using OutDataType = ck::half_t; -using ImageLayout = ck::tensor_layout::convolution::GNHWC; +using ImageLayout = ck::tensor_layout::convolution::NHWGC; static constexpr ck::index_t NumDimSpatial = 2; -static constexpr ck::index_t G = 1; +static constexpr ck::index_t G = 2; static constexpr ck::index_t N = 32; // batch size static constexpr ck::index_t C = 32; // input channel (per group) static constexpr ck::index_t Y = 3; // filter H @@ -52,18 +52,18 @@ int main() std::array wei_spatial_lengths{Y, X}; std::array out_spatial_lengths{Ho, Wo}; - // We have NHWGC in memory space (G is dummy) - // However, CK's API only accept length and stride with order of GNCHW - // Hence, we need to adjust the order of stride + // We have NHWGC in memory space + // However, CK's API only accepts lengths and strides with order of GNCHW. + // Hence, we need to adjust the order of strides. std::array image_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C}; - std::array gemm_strides{Y * X * C, 1}; + std::array gemm_strides{Y * X * C, G * Y * X * C, 1}; std::array filter_strides{1, 1}; std::array filter_dilations{1, 1}; std::array input_left_pads{1, 1}; std::array input_right_pads{1, 1}; - SimpleDeviceMem in(sizeof(InDataType) * N * Ho * Wo * Y * X * C); + SimpleDeviceMem in(sizeof(InDataType) * G * N * Ho * Wo * Y * X * C); SimpleDeviceMem out(sizeof(OutDataType) * N * Hi * Wi * G * C); using namespace ck::conv_tensor_rearrange_op; @@ -93,6 +93,7 @@ int main() auto& op_ptr = op_ptrs[i]; auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), out.GetDeviceBuffer(), + G, N, C, in_spatial_lengths, @@ -112,7 +113,7 @@ int main() float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C + - sizeof(OutDataType) * N * Ho * Wo * Y * X * C; + sizeof(OutDataType) * G * N * Ho * Wo * Y * X * C; float gb_per_sec = num_bytes / 1.E6 / avg_time; @@ -149,6 +150,7 @@ int main() << std::endl; auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), out.GetDeviceBuffer(), + G, N, C, in_spatial_lengths, diff --git a/client_example/22_im2col_col2im/image_to_column.cpp b/client_example/22_im2col_col2im/image_to_column.cpp index a1447abf64..8eafbdc5bb 100644 --- a/client_example/22_im2col_col2im/image_to_column.cpp +++ b/client_example/22_im2col_col2im/image_to_column.cpp @@ -16,10 +16,10 @@ using InDataType = ck::half_t; using OutDataType = ck::half_t; -using ImageLayout = ck::tensor_layout::convolution::GNHWC; +using ImageLayout = ck::tensor_layout::convolution::NHWGC; static constexpr ck::index_t NumDimSpatial = 2; -static constexpr ck::index_t G = 1; +static constexpr ck::index_t G = 2; static constexpr ck::index_t N = 32; // batch size static constexpr ck::index_t C = 32; // input channel (per group) static constexpr ck::index_t Y = 3; // filter H @@ -52,11 +52,11 @@ int main() std::array wei_spatial_lengths{Y, X}; std::array out_spatial_lengths{Ho, Wo}; - // We have NHWGC in memory space (G is dummy) - // However, CK's API only accept length and stride with order of GNCHW - // Hence, we need to adjust the order of stride + // We have NHWGC in memory space + // However, CK's API only accepts lengths and strides with order of GNCHW. + // Hence, we need to adjust the order of strides. std::array image_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C}; - std::array gemm_strides{Y * X * C, 1}; + std::array gemm_strides{Y * X * C, G * Y * X * C, 1}; std::array filter_strides{1, 1}; std::array filter_dilations{1, 1}; @@ -64,7 +64,7 @@ int main() std::array input_right_pads{1, 1}; SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C); - SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * Y * X * C); + SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * Y * X * C); using namespace ck::conv_tensor_rearrange_op; @@ -93,6 +93,7 @@ int main() auto& op_ptr = op_ptrs[i]; auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), out.GetDeviceBuffer(), + G, N, C, in_spatial_lengths, @@ -112,7 +113,7 @@ int main() float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C + - sizeof(OutDataType) * N * Ho * Wo * Y * X * C; + sizeof(OutDataType) * G * N * Ho * Wo * Y * X * C; float gb_per_sec = num_bytes / 1.E6 / avg_time; @@ -149,6 +150,7 @@ int main() << std::endl; auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), out.GetDeviceBuffer(), + G, N, C, in_spatial_lengths, diff --git a/example/52_im2col_col2im/column_to_image_f32.cpp b/example/52_im2col_col2im/column_to_image_f32.cpp index 52144e6885..047f2a2118 100644 --- a/example/52_im2col_col2im/column_to_image_f32.cpp +++ b/example/52_im2col_col2im/column_to_image_f32.cpp @@ -20,7 +20,7 @@ using DeviceColToImgInstance = ck::tensor_operation::device::DeviceColumnToImage bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::ConvParam& conv_params) { - + const auto G = conv_params.G_; const auto N = conv_params.N_; const auto C = conv_params.C_; @@ -31,7 +31,7 @@ bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::Conv C * ck::accumulate_n( conv_params.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>()); - const auto in_desc = HostTensorDescriptor({NDoHoWo, CZYX}); + const auto in_desc = HostTensorDescriptor({G, NDoHoWo, CZYX}); const auto out_desc = ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_params); @@ -39,7 +39,7 @@ bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::Conv std::array filter_spatial_lengths{}; std::array output_spatial_lengths{}; std::array image_g_n_c_wis_strides{}; - std::array gemm_m_k_strides{}; + std::array gemm_g_m_k_strides{}; std::array conv_filter_strides{}; std::array conv_filter_dilations{}; std::array input_left_pads{}; @@ -50,7 +50,7 @@ bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::Conv copy(conv_params.input_spatial_lengths_, input_spatial_lengths); copy(conv_params.filter_spatial_lengths_, filter_spatial_lengths); copy(conv_params.output_spatial_lengths_, output_spatial_lengths); - copy(in_desc.GetStrides(), gemm_m_k_strides); + copy(in_desc.GetStrides(), gemm_g_m_k_strides); copy(out_desc.GetStrides(), image_g_n_c_wis_strides); copy(conv_params.conv_filter_strides_, conv_filter_strides); copy(conv_params.conv_filter_dilations_, conv_filter_dilations); @@ -86,13 +86,14 @@ bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::Conv auto invoker = col2img.MakeInvoker(); auto argument = col2img.MakeArgument(in_device_buf.GetDeviceBuffer(), out_device_buf.GetDeviceBuffer(), + G, N, C, input_spatial_lengths, filter_spatial_lengths, output_spatial_lengths, image_g_n_c_wis_strides, - gemm_m_k_strides, + gemm_g_m_k_strides, conv_filter_strides, conv_filter_dilations, input_left_pads, @@ -108,7 +109,7 @@ bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::Conv } float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); - std::size_t num_btype = NDoHoWo * CZYX * (sizeof(OutDataType) + sizeof(InDataType)); + std::size_t num_btype = G * NDoHoWo * CZYX * (sizeof(OutDataType) + sizeof(InDataType)); float gb_per_sec = num_btype / 1.E6 / ave_time; std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl; diff --git a/example/52_im2col_col2im/image_to_column_f32.cpp b/example/52_im2col_col2im/image_to_column_f32.cpp index 6d883460cc..140bdf8062 100644 --- a/example/52_im2col_col2im/image_to_column_f32.cpp +++ b/example/52_im2col_col2im/image_to_column_f32.cpp @@ -20,7 +20,7 @@ using DeviceImgToColInstance = ck::tensor_operation::device::DeviceImageToColumn bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::ConvParam& conv_params) { - + const auto G = conv_params.G_; const auto N = conv_params.N_; const auto C = conv_params.C_; @@ -33,13 +33,13 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv const auto in_desc = ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_params); - const auto out_desc = HostTensorDescriptor({NDoHoWo, CZYX}); + const auto out_desc = HostTensorDescriptor({G, NDoHoWo, CZYX}); std::array input_spatial_lengths{}; std::array filter_spatial_lengths{}; std::array output_spatial_lengths{}; std::array image_g_n_c_wis_strides{}; - std::array gemm_m_k_strides{}; + std::array gemm_g_m_k_strides{}; std::array conv_filter_strides{}; std::array conv_filter_dilations{}; std::array input_left_pads{}; @@ -51,7 +51,7 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv copy(conv_params.filter_spatial_lengths_, filter_spatial_lengths); copy(conv_params.output_spatial_lengths_, output_spatial_lengths); copy(in_desc.GetStrides(), image_g_n_c_wis_strides); - copy(out_desc.GetStrides(), gemm_m_k_strides); + copy(out_desc.GetStrides(), gemm_g_m_k_strides); copy(conv_params.conv_filter_strides_, conv_filter_strides); copy(conv_params.conv_filter_dilations_, conv_filter_dilations); copy(conv_params.input_left_pads_, input_left_pads); @@ -86,13 +86,14 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv auto invoker = img2col.MakeInvoker(); auto argument = img2col.MakeArgument(in_device_buf.GetDeviceBuffer(), out_device_buf.GetDeviceBuffer(), + G, N, C, input_spatial_lengths, filter_spatial_lengths, output_spatial_lengths, image_g_n_c_wis_strides, - gemm_m_k_strides, + gemm_g_m_k_strides, conv_filter_strides, conv_filter_dilations, input_left_pads, @@ -108,7 +109,7 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv } float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); - std::size_t num_btype = NDoHoWo * CZYX * (sizeof(OutDataType) + sizeof(InDataType)); + std::size_t num_btype = G * NDoHoWo * CZYX * (sizeof(OutDataType) + sizeof(InDataType)); float gb_per_sec = num_btype / 1.E6 / ave_time; std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl; diff --git a/include/ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp b/include/ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp index 898cfe0f2c..b28204fd84 100644 --- a/include/ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp +++ b/include/ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp @@ -14,11 +14,12 @@ namespace device { /** * \brief Convolution Tensor Rearrange. * - * This Device operator supports conversion image ([G, N, Di, Hi, Wi, C]) to - * the gemm problem([N * Do * Ho * Wo, Z * Y * X * C]) (Image to Column) and - * conversion gemm form to the image (Column to Image). - * - * Note that G must be equal to 1. + * This Device operator supports converting an image to + * the GEMM representation (Image to Column) and + * converting a GEMM form to the image (Column to Image). + * Supported layouts: + * [G, N, Di, Hi, Wi, C] <-> [G, N * Do * Ho * Wo, Z * Y * X * C] + * [N, Di, Hi, Wi, G, C] <-> [N * Do * Ho * Wo, G, Z * Y * X * C] * * \tparam NDimSpatial Number of spatial dimensions. * \tparam ImageLayout Input Layout. @@ -39,13 +40,14 @@ struct DeviceConvTensorRearrange : public BaseOperator * * \param p_in A pointer to the device memory of the input image. * \param p_out A pointer to the device memory of the output. + * \param G Convolution number of groups. * \param N Convolution batch size. * \param C Convolution number of channels. * \param input_spatial_lengths Input spatial lengths. * \param filter_spatial_lengths Filter spatial lengths. * \param output_spatial_lengths Output spatial lengths. * \param image_g_n_c_wis_strides Image strides in order [G, N, C, D, H, W]. - * \param gemm_m_k_strides Gemm form strides. + * \param gemm_g_m_k_strides Gemm form strides. * \param conv_filter_strides Convolution filter strides. * \param conv_filter_dilations Convolution filter dilations. * \param input_left_pads Convolution left pads. @@ -55,13 +57,14 @@ struct DeviceConvTensorRearrange : public BaseOperator virtual std::unique_ptr MakeArgumentPointer(const void* p_in, void* p_out, + const ck::index_t G, const ck::index_t N, const ck::index_t C, const std::array& input_spatial_lengths, const std::array& filter_spatial_lengths, const std::array& output_spatial_lengths, const std::array& image_g_n_c_wis_strides, - const std::array& gemm_m_k_strides, + const std::array& gemm_g_m_k_strides, const std::array& conv_filter_strides, const std::array& conv_filter_dilations, const std::array& input_left_pads, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp index f8b4a01681..567be5f364 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp @@ -17,15 +17,18 @@ #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp" #include "ck/host_utility/io.hpp" namespace ck { namespace tensor_operation { namespace device { -// Image to column for input layout NDHWC: -// input : image converted to the gemm problem [N * Do * Ho * Wo, Z * Y * X * C] -// output : image [N, Di, Hi, Wi, C] +// Column to Image: +// input : gemm form [G, N * Do * Ho * Wo, Z * Y * X * C] +// output : input image [G, N, Di, Hi, Wi, C] +// input : gemm form [N * Do * Ho * Wo, G, Z * Y * X * C] +// output : input image [N, Di, Hi, Wi, G, C] template { + static constexpr bool is_NSpatialGC = + std::is_same_v || + std::is_same_v || + std::is_same_v; + static constexpr bool is_GNSpatialC = + std::is_same_v || + std::is_same_v || + std::is_same_v; static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; @@ -90,7 +101,7 @@ struct DeviceColumnToImageImpl const std::array& filter_spatial_lengths, const std::array& output_spatial_lengths, const std::array& conv_filter_strides, - const std::array& gemm_m_k_strides, + const std::array& gemm_g_m_k_strides, const std::array& independent_filters, const std::array& effs) { @@ -100,23 +111,23 @@ struct DeviceColumnToImageImpl C * ck::accumulate_n( filter_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>()); - const index_t NStride = DoHoWo * gemm_m_k_strides[I0] * gemm_m_k_strides[I1]; + const index_t NStride = DoHoWo * gemm_g_m_k_strides[I1] * gemm_g_m_k_strides[I2]; // Calculate the appropriate stride for each set of independent filters // in each dimension - const index_t WStride = - math::integer_divide_ceil(effs[XIdx], conv_filter_strides[XIdx]) * gemm_m_k_strides[I0]; + const index_t WStride = math::integer_divide_ceil(effs[XIdx], conv_filter_strides[XIdx]) * + gemm_g_m_k_strides[I1]; const index_t HStride = math::integer_divide_ceil(effs[YIdx], conv_filter_strides[YIdx]) * - output_spatial_lengths[XIdx] * gemm_m_k_strides[I0]; + output_spatial_lengths[XIdx] * gemm_g_m_k_strides[I1]; const index_t DStride = math::integer_divide_ceil(effs[ZIdx], conv_filter_strides[ZIdx]) * output_spatial_lengths[YIdx] * output_spatial_lengths[XIdx] * - gemm_m_k_strides[I0]; + gemm_g_m_k_strides[I1]; // Create descriptor for independent filters in each dimension and // then merge them into column form if constexpr(NDimSpatial == 1) { const auto desc_gemm_form = make_naive_tensor_descriptor(make_tuple(N, independent_filters[XIdx], CZYX), - make_tuple(NStride, WStride, gemm_m_k_strides[I1])); + make_tuple(NStride, WStride, gemm_g_m_k_strides[I2])); const auto desc_gemm_form_merged_filters = transform_tensor_descriptor( desc_gemm_form, make_tuple(make_merge_transform(make_tuple(N, independent_filters[XIdx])), @@ -130,7 +141,7 @@ struct DeviceColumnToImageImpl { const auto desc_gemm_form = make_naive_tensor_descriptor( make_tuple(N, independent_filters[YIdx], independent_filters[XIdx], CZYX), - make_tuple(NStride, HStride, WStride, gemm_m_k_strides[I1])); + make_tuple(NStride, HStride, WStride, gemm_g_m_k_strides[I2])); const auto desc_gemm_form_merged_filters = transform_tensor_descriptor( desc_gemm_form, make_tuple(make_merge_transform( @@ -149,7 +160,7 @@ struct DeviceColumnToImageImpl independent_filters[YIdx], independent_filters[XIdx], CZYX), - make_tuple(NStride, DStride, HStride, WStride, gemm_m_k_strides[I1])); + make_tuple(NStride, DStride, HStride, WStride, gemm_g_m_k_strides[I2])); const auto desc_gemm_form_merged_filters = transform_tensor_descriptor( desc_gemm_form, make_tuple(make_merge_transform(make_tuple(N, @@ -252,34 +263,38 @@ struct DeviceColumnToImageImpl decltype(BlockToCTileMap_M00_N0_M01Adapt( InputGridDesc{}))>; - using GridwiseTensorRearrangeKernel = GridwiseTensorRearrange; + using GridwiseTensorRearrangeKernel = + GridwiseTensorRearrange>; struct Argument : public BaseArgument { Argument(const void* p_in, // input image void* p_out, // output image + const ck::index_t G, const ck::index_t N, const ck::index_t C, const std::array& input_spatial_lengths, const std::array& filter_spatial_lengths, const std::array& output_spatial_lengths, const std::array& image_g_n_c_wis_strides, - const std::array& gemm_m_k_strides, + const std::array& gemm_g_m_k_strides, const std::array& conv_filter_strides, const std::array& conv_filter_dilations, const std::array& input_left_pads, const std::array& input_right_pads) - : C_(C), + : G_(G), + C_(C), X_(filter_spatial_lengths[NDimSpatial - I1]), p_in_{static_cast(p_in)}, p_out_{static_cast(p_out)}, @@ -289,6 +304,9 @@ struct DeviceColumnToImageImpl input_left_pads_{input_left_pads}, input_right_pads_{input_right_pads} { + compute_ptr_offset_of_batch_.BatchStrideA_ = gemm_g_m_k_strides[I0]; + compute_ptr_offset_of_batch_.BatchStrideC_ = image_g_n_c_wis_strides[I0]; + const index_t x_eff = (filter_spatial_lengths[XIdx] - 1) * conv_filter_dilations[XIdx] + 1; const index_t y_eff = @@ -354,7 +372,7 @@ struct DeviceColumnToImageImpl filter_spatial_lengths, output_spatial_lengths, conv_filter_strides, - gemm_m_k_strides, + gemm_g_m_k_strides, independent_filters, effs); const auto out_grid_desc_m_k = @@ -387,10 +405,9 @@ struct DeviceColumnToImageImpl // Memory offsets to next set of independent filters, // move to independent filters in each dimension const index_t in_offset = - x_idx * gemm_m_k_strides[0] + - y_idx * gemm_m_k_strides[0] * output_spatial_lengths[XIdx] + - z_idx * gemm_m_k_strides[0] * output_spatial_lengths[YIdx] * - output_spatial_lengths[XIdx]; + (x_idx + y_idx * output_spatial_lengths[XIdx] + + z_idx * output_spatial_lengths[YIdx] * output_spatial_lengths[XIdx]) * + gemm_g_m_k_strides[I1]; // Move to independent filters in appropriate dimensions const index_t out_offset = x_offset_with_pad * image_g_n_c_wis_strides[spatial_offset + XIdx] + @@ -417,6 +434,7 @@ struct DeviceColumnToImageImpl } } + const ck::index_t G_; const ck::index_t C_; const ck::index_t X_; @@ -434,6 +452,8 @@ struct DeviceColumnToImageImpl std::vector p_in_container_; std::vector p_out_container_; + + ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_; }; struct Invoker : public BaseInvoker @@ -451,6 +471,7 @@ struct DeviceColumnToImageImpl OutputGridDesc, OutputDataType, Block2ETileMap, + ComputePtrOffsetOfStridedBatch, GridwiseTensorRearrangeKernel>; // Execute each set of independent filters @@ -460,7 +481,7 @@ struct DeviceColumnToImageImpl BlockToCTileMap_M00_N0_M01Adapt( arg.out_grid_desc_m_k_container_[i]); const index_t grid_size = - block_2_tile_map.CalculateGridSize(arg.in_grid_desc_m_k_container_[i]); + block_2_tile_map.CalculateGridSize(arg.in_grid_desc_m_k_container_[i]) * arg.G_; elapsed_time += launch_and_time_kernel(stream_config, kernel, dim3(grid_size), @@ -470,7 +491,9 @@ struct DeviceColumnToImageImpl arg.p_in_container_[i], arg.out_grid_desc_m_k_container_[i], arg.p_out_container_[i], - block_2_tile_map); + arg.G_, + block_2_tile_map, + arg.compute_ptr_offset_of_batch_); } return elapsed_time; } @@ -485,8 +508,7 @@ struct DeviceColumnToImageImpl bool IsSupportedArgument(const Argument& arg) { using namespace tensor_layout::convolution; - if constexpr(!(std::is_same_v || std::is_same_v || - std::is_same_v)) + if constexpr(!(is_NSpatialGC || is_GNSpatialC)) { return false; } @@ -534,13 +556,14 @@ struct DeviceColumnToImageImpl static auto MakeArgument(const void* p_in, // input image void* p_out, // output image + const ck::index_t G, const ck::index_t N, const ck::index_t C, const std::array& input_spatial_lengths, const std::array& filter_spatial_lengths, const std::array& output_spatial_lengths, const std::array& image_g_n_c_wis_strides, - const std::array& gemm_m_k_strides, + const std::array& gemm_g_m_k_strides, const std::array& conv_filter_strides, const std::array& conv_filter_dilations, const std::array& input_left_pads, @@ -548,13 +571,14 @@ struct DeviceColumnToImageImpl { return Argument{static_cast(p_in), static_cast(p_out), + G, N, C, input_spatial_lengths, filter_spatial_lengths, output_spatial_lengths, image_g_n_c_wis_strides, - gemm_m_k_strides, + gemm_g_m_k_strides, conv_filter_strides, conv_filter_dilations, input_left_pads, @@ -566,13 +590,14 @@ struct DeviceColumnToImageImpl std::unique_ptr MakeArgumentPointer(const void* p_in, // input image void* p_out, // output image + const ck::index_t G, const ck::index_t N, const ck::index_t C, const std::array& input_spatial_lengths, const std::array& filter_spatial_lengths, const std::array& output_spatial_lengths, const std::array& image_g_n_c_wis_strides, - const std::array& gemm_m_k_strides, + const std::array& gemm_g_m_k_strides, const std::array& conv_filter_strides, const std::array& conv_filter_dilations, const std::array& input_left_pads, @@ -580,13 +605,14 @@ struct DeviceColumnToImageImpl { return std::make_unique(static_cast(p_in), static_cast(p_out), + G, N, C, input_spatial_lengths, filter_spatial_lengths, output_spatial_lengths, image_g_n_c_wis_strides, - gemm_m_k_strides, + gemm_g_m_k_strides, conv_filter_strides, conv_filter_dilations, input_left_pads, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp index 8b4ae7875c..c83ffdcd26 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp @@ -15,15 +15,18 @@ #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_utils.hpp" #include "ck/host_utility/io.hpp" namespace ck { namespace tensor_operation { namespace device { -// Image to column for input layout NDHWC: -// input : input image [N, Di, Hi, Wi, C] -// output : gemm form [N * Do * Ho * Wo, Z * Y * X * C] +// Image to column: +// input : input image [G, N, Di, Hi, Wi, C] +// output : gemm form [G * N * Do * Ho * Wo, Z * Y * X * C] +// input : input image [N, Di, Hi, Wi, G, C] +// output : gemm form [N * Do * Ho * Wo * G, Z * Y * X * C] template { + static constexpr bool is_NSpatialGC = + std::is_same_v || + std::is_same_v || + std::is_same_v; + static constexpr bool is_GNSpatialC = + std::is_same_v || + std::is_same_v || + std::is_same_v; static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; @@ -109,7 +120,7 @@ struct DeviceImageToColumnImpl const ck::index_t C, const std::array& filter_spatial_lengths, const std::array& output_spatial_lengths, - const std::array& gemm_m_k_strides) + const std::array& gemm_g_m_k_strides) { const index_t NDoHoWo = N * ck::accumulate_n( @@ -117,11 +128,10 @@ struct DeviceImageToColumnImpl const index_t CZYX = C * ck::accumulate_n( filter_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>()); - const auto desc_mraw_kraw = make_naive_tensor_descriptor( - make_tuple(NDoHoWo, CZYX), make_tuple(gemm_m_k_strides[I0], gemm_m_k_strides[I1])); - const auto desc_m_k = matrix_padder.PadADescriptor_M_K(desc_mraw_kraw); - return desc_m_k; + const auto desc_mraw_kraw = make_naive_tensor_descriptor( + make_tuple(NDoHoWo, CZYX), make_tuple(gemm_g_m_k_strides[I1], gemm_g_m_k_strides[I2])); + return matrix_padder.PadADescriptor_M_K(desc_mraw_kraw); } using InputGridDesc = @@ -132,34 +142,38 @@ struct DeviceImageToColumnImpl decltype(BlockToCTileMap_M00_N0_M01Adapt( OutputGridDesc{}))>; - using GridwiseTensorRearrangeKernel = GridwiseTensorRearrange; + using GridwiseTensorRearrangeKernel = + GridwiseTensorRearrange>; struct Argument : public BaseArgument { Argument(const void* p_in, // input image void* p_out, // gemm form + const ck::index_t G, const ck::index_t N, const ck::index_t C, const std::array& input_spatial_lengths, const std::array& filter_spatial_lengths, const std::array& output_spatial_lengths, const std::array& image_g_n_c_wis_strides, - const std::array& gemm_m_k_strides, + const std::array& gemm_g_m_k_strides, const std::array& conv_filter_strides, const std::array& conv_filter_dilations, const std::array& input_left_pads, const std::array& input_right_pads) - : C_(C), + : G_(G), + C_(C), X_(filter_spatial_lengths[NDimSpatial - I1]), p_in_{static_cast(p_in)}, p_out_{static_cast(p_out)}, @@ -176,14 +190,16 @@ struct DeviceImageToColumnImpl filter_spatial_lengths, output_spatial_lengths, image_g_n_c_wis_strides, - conv_filter_strides, conv_filter_dilations, input_left_pads, input_right_pads); out_grid_desc_m_k_ = MakeOutDescriptor_M_K( - N, C, filter_spatial_lengths, output_spatial_lengths, gemm_m_k_strides); + N, C, filter_spatial_lengths, output_spatial_lengths, gemm_g_m_k_strides); + + compute_ptr_offset_of_batch_.BatchStrideA_ = image_g_n_c_wis_strides[I0]; + compute_ptr_offset_of_batch_.BatchStrideC_ = gemm_g_m_k_strides[I0]; } void Print() const @@ -192,6 +208,7 @@ struct DeviceImageToColumnImpl std::cout << out_grid_desc_m_k_ << std::endl; } + const ck::index_t G_; const ck::index_t C_; const ck::index_t X_; @@ -206,6 +223,8 @@ struct DeviceImageToColumnImpl InputGridDesc in_grid_desc_m_k_; OutputGridDesc out_grid_desc_m_k_; + + ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch_; }; struct Invoker : public BaseInvoker @@ -220,12 +239,14 @@ struct DeviceImageToColumnImpl const auto block_2_tile_map = BlockToCTileMap_M00_N0_M01Adapt( arg.out_grid_desc_m_k_); - const index_t grid_size = block_2_tile_map.CalculateGridSize(arg.out_grid_desc_m_k_); - const auto kernel = kernel_tensor_rearrange, GridwiseTensorRearrangeKernel>; float elapsed_time = launch_and_time_kernel(stream_config, @@ -237,7 +258,9 @@ struct DeviceImageToColumnImpl arg.p_in_, arg.out_grid_desc_m_k_, arg.p_out_, - block_2_tile_map); + arg.G_, + block_2_tile_map, + arg.compute_ptr_offset_of_batch_); return elapsed_time; } @@ -250,9 +273,7 @@ struct DeviceImageToColumnImpl bool IsSupportedArgument(const Argument& arg) { - using namespace tensor_layout::convolution; - if constexpr(!(std::is_same_v || std::is_same_v || - std::is_same_v)) + if constexpr(!(is_NSpatialGC || is_GNSpatialC)) { return false; } @@ -295,13 +316,14 @@ struct DeviceImageToColumnImpl static auto MakeArgument(const void* p_in, // input image void* p_out, // gemm form + const ck::index_t G, const ck::index_t N, const ck::index_t C, const std::array& input_spatial_lengths, const std::array& filter_spatial_lengths, const std::array& output_spatial_lengths, const std::array& image_g_n_c_wis_strides, - const std::array& gemm_m_k_strides, + const std::array& gemm_g_m_k_strides, const std::array& conv_filter_strides, const std::array& conv_filter_dilations, const std::array& input_left_pads, @@ -309,13 +331,14 @@ struct DeviceImageToColumnImpl { return Argument{static_cast(p_in), static_cast(p_out), + G, N, C, input_spatial_lengths, filter_spatial_lengths, output_spatial_lengths, image_g_n_c_wis_strides, - gemm_m_k_strides, + gemm_g_m_k_strides, conv_filter_strides, conv_filter_dilations, input_left_pads, @@ -327,13 +350,14 @@ struct DeviceImageToColumnImpl std::unique_ptr MakeArgumentPointer(const void* p_in, // input image void* p_out, // gemm form + const ck::index_t G, const ck::index_t N, const ck::index_t C, const std::array& input_spatial_lengths, const std::array& filter_spatial_lengths, const std::array& output_spatial_lengths, const std::array& image_g_n_c_wis_strides, - const std::array& gemm_m_k_strides, + const std::array& gemm_g_m_k_strides, const std::array& conv_filter_strides, const std::array& conv_filter_dilations, const std::array& input_left_pads, @@ -341,13 +365,14 @@ struct DeviceImageToColumnImpl { return std::make_unique(static_cast(p_in), static_cast(p_out), + G, N, C, input_spatial_lengths, filter_spatial_lengths, output_spatial_lengths, image_g_n_c_wis_strides, - gemm_m_k_strides, + gemm_g_m_k_strides, conv_filter_strides, conv_filter_dilations, input_left_pads, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp index e4e47b4fae..f77ffff350 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp @@ -21,6 +21,7 @@ template __global__ void #if CK_USE_LAUNCH_BOUNDS @@ -30,13 +31,20 @@ __global__ void const InputDataType* __restrict__ p_in_global, const OutputGridDesc out_grid_desc, OutputDataType* __restrict__ p_out_global, - const Block2ETileMap block_2_tile_map) + const index_t batch_count, + const Block2ETileMap block_2_tile_map, + const ComputePtrOffsetOfStridedBatch compute_ptr_offset_of_batch) { #if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \ defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || \ defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx941__) || defined(__gfx942__)) - GridwiseTensorRearrangeKernel::Run( - in_grid_desc, p_in_global, out_grid_desc, p_out_global, block_2_tile_map); + GridwiseTensorRearrangeKernel::Run(in_grid_desc, + p_in_global, + out_grid_desc, + p_out_global, + batch_count, + block_2_tile_map, + compute_ptr_offset_of_batch); #else ignore = in_grid_desc; ignore = p_in_global; @@ -56,7 +64,8 @@ template + typename Block2ETileMap, + typename ComputePtrOffsetOfStridedBatch> struct GridwiseTensorRearrange { @@ -69,7 +78,9 @@ struct GridwiseTensorRearrange const InputDataType* __restrict__ p_in_global, const OutputGridDesc& out_grid_desc, OutputDataType* __restrict__ p_out_global, - const Block2ETileMap& block_2_tile_map) + const index_t batch_count, + const Block2ETileMap& block_2_tile_map, + const ComputePtrOffsetOfStridedBatch& compute_ptr_offset_of_batch) { const auto block_work_idx = block_2_tile_map.CalculateBottomIndex(make_multi_index(get_block_1d_id())); @@ -80,12 +91,6 @@ struct GridwiseTensorRearrange const index_t k_block_data_idx_on_grid = __builtin_amdgcn_readfirstlane(block_work_idx[I1] * KPerBlock); - // Global Memory - const auto in_global_buf = make_dynamic_buffer( - p_in_global, in_grid_desc.GetElementSpaceSize()); - auto out_global_buf = make_dynamic_buffer( - p_out_global, out_grid_desc.GetElementSpaceSize()); - auto copy_global_to_global = ThreadGroupTensorSliceTransfer_v7, @@ -108,6 +113,22 @@ struct GridwiseTensorRearrange make_tuple(make_multi_index(m_block_data_idx_on_grid, k_block_data_idx_on_grid)), tensor_operation::element_wise::PassThrough{}}; + const index_t num_blocks_per_batch = + __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count); + const index_t g_idx = + __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch); + + // Global Memory + const index_t a_batch_offset = + __builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetAPtrOffset(g_idx)); + const index_t c_batch_offset = + __builtin_amdgcn_readfirstlane(compute_ptr_offset_of_batch.GetCPtrOffset(g_idx)); + + const auto in_global_buf = make_dynamic_buffer( + p_in_global + a_batch_offset, in_grid_desc.GetElementSpaceSize()); + auto out_global_buf = make_dynamic_buffer( + p_out_global + c_batch_offset, out_grid_desc.GetElementSpaceSize()); + copy_global_to_global.Run( tie(in_grid_desc), tie(in_global_buf), tie(out_grid_desc), tie(out_global_buf)); } diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp index 7ccfc6eb77..45e35ec56d 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp @@ -19,9 +19,7 @@ namespace host { * \brief Reference implementation for column to image. * * Input tensor descriptor has [N * Do * Ho * Wo, Z * Y * X * C] data layout. - * Memory layout is the same. * Output tensor descriptor has [G, N, C, Di, Hi, Wi] data layout. - * G must be equal to 1. Memory layout is [G, N, Di, Hi, Wi, C]. * * \tparam NDimSpatial Number of spatial dimensions. * \tparam ImageLayout Image Layout. @@ -95,18 +93,19 @@ struct ReferenceColumnToImage : public device::BaseOperator float Run(const Argument& arg) { if(!(arg.output_.GetNumOfDimension() == NDimSpatial + 3 && - arg.input_.GetNumOfDimension() == 2)) + arg.input_.GetNumOfDimension() == 3)) { throw std::runtime_error("wrong! inconsistent dimension"); } + const index_t G = arg.output_.GetLengths()[0]; const index_t N = arg.output_.GetLengths()[1]; const index_t C = arg.output_.GetLengths()[2]; if constexpr(NDimSpatial == 1) { const index_t Wo = arg.output_spatial_lengths_[0]; - auto func = [&](auto n) { + auto func = [&](auto g, auto n) { for(index_t wo = 0; wo < Wo; ++wo) { index_t row = n * Wo + wo; @@ -123,9 +122,10 @@ struct ReferenceColumnToImage : public device::BaseOperator if(wi >= 0 && ck::type_convert(wi) < arg.output_.GetLengths()[3]) { - float v_in = ck::type_convert(arg.input_(row, column)); - float v_out = ck::type_convert(arg.output_(0, n, c, wi)); - arg.output_(0, n, c, wi) = + float v_in = + ck::type_convert(arg.input_(g, row, column)); + float v_out = ck::type_convert(arg.output_(g, n, c, wi)); + arg.output_(g, n, c, wi) = ck::type_convert(v_in + v_out); } column++; @@ -134,7 +134,7 @@ struct ReferenceColumnToImage : public device::BaseOperator } }; - make_ParallelTensorFunctor(func, N)(std::thread::hardware_concurrency()); + make_ParallelTensorFunctor(func, G, N)(std::thread::hardware_concurrency()); return 0; } @@ -143,7 +143,7 @@ struct ReferenceColumnToImage : public device::BaseOperator const index_t Ho = arg.output_spatial_lengths_[0]; const index_t Wo = arg.output_spatial_lengths_[1]; - auto func = [&](auto n) { + auto func = [&](auto g, auto n) { for(index_t ho = 0; ho < Ho; ++ho) { for(index_t wo = 0; wo < Wo; ++wo) @@ -176,10 +176,10 @@ struct ReferenceColumnToImage : public device::BaseOperator arg.output_.GetLengths()[4]) { float v_in = - ck::type_convert(arg.input_(row, column)); + ck::type_convert(arg.input_(g, row, column)); float v_out = ck::type_convert( - arg.output_(0, n, c, hi, wi)); - arg.output_(0, n, c, hi, wi) = + arg.output_(g, n, c, hi, wi)); + arg.output_(g, n, c, hi, wi) = ck::type_convert(v_in + v_out); } column++; @@ -190,7 +190,7 @@ struct ReferenceColumnToImage : public device::BaseOperator } }; - make_ParallelTensorFunctor(func, N)(std::thread::hardware_concurrency()); + make_ParallelTensorFunctor(func, G, N)(std::thread::hardware_concurrency()); return 0; } @@ -200,7 +200,7 @@ struct ReferenceColumnToImage : public device::BaseOperator const index_t Ho = arg.output_spatial_lengths_[1]; const index_t Wo = arg.output_spatial_lengths_[2]; - auto func = [&](auto n) { + auto func = [&](auto g, auto n) { for(index_t d_o = 0; d_o < Do; ++d_o) { for(index_t ho = 0; ho < Ho; ++ho) @@ -245,10 +245,10 @@ struct ReferenceColumnToImage : public device::BaseOperator arg.output_.GetLengths()[5]) { float v_in = ck::type_convert( - arg.input_(row, column)); + arg.input_(g, row, column)); float v_out = ck::type_convert( - arg.output_(0, n, c, di, hi, wi)); - arg.output_(0, n, c, di, hi, wi) = + arg.output_(g, n, c, di, hi, wi)); + arg.output_(g, n, c, di, hi, wi) = ck::type_convert(v_in + v_out); } column++; @@ -261,7 +261,7 @@ struct ReferenceColumnToImage : public device::BaseOperator } }; - make_ParallelTensorFunctor(func, N)(std::thread::hardware_concurrency()); + make_ParallelTensorFunctor(func, G, N)(std::thread::hardware_concurrency()); return 0; } @@ -303,8 +303,9 @@ struct ReferenceColumnToImage : public device::BaseOperator C * ck::accumulate_n( arg.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>()); - if(!(arg.input_.GetLengths()[0] == static_cast(NDoHoWo) && - arg.input_.GetLengths()[1] == static_cast(CZYX))) + if(!(arg.input_.GetLengths()[0] == static_cast(G) && + arg.input_.GetLengths()[1] == static_cast(NDoHoWo) && + arg.input_.GetLengths()[2] == static_cast(CZYX))) { return false; } diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp index 9e12d07844..56b0ce7914 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp @@ -19,9 +19,7 @@ namespace host { * \brief Reference implementation for image to column. * * Input tensor descriptor has [G, N, C, Di, Hi, Wi] data layout. - * G must be equal to 1. Memory layout is [G, N, Di, Hi, Wi, C]. - * Output tensor descriptor has [N * Do * Ho * Wo, Z * Y * X * C] data layout. - * Memory layout is the same. + * Output tensor descriptor has [G * N * Do * Ho * Wo, Z * Y * X * C] data layout. * * \tparam NDimSpatial Number of spatial dimensions. * \tparam ImageLayout Image Layout. @@ -95,18 +93,19 @@ struct ReferenceImageToColumn : public device::BaseOperator float Run(const Argument& arg) { if(!(arg.input_.GetNumOfDimension() == NDimSpatial + 3 && - arg.output_.GetNumOfDimension() == 2)) + arg.output_.GetNumOfDimension() == 3)) { throw std::runtime_error("wrong! inconsistent dimension"); } + const index_t G = arg.input_.GetLengths()[0]; const index_t N = arg.input_.GetLengths()[1]; const index_t C = arg.input_.GetLengths()[2]; if constexpr(NDimSpatial == 1) { const index_t Wo = arg.output_spatial_lengths_[0]; - auto func = [&](auto n, auto wo) { + auto func = [&](auto g, auto n, auto wo) { index_t row = n * Wo + wo; index_t column = 0; @@ -121,15 +120,15 @@ struct ReferenceImageToColumn : public device::BaseOperator if(wi >= 0 && ck::type_convert(wi) < arg.input_.GetLengths()[3]) { - InDataType v_in = arg.input_(0, n, c, wi); - arg.output_(row, column) = ck::type_convert(v_in); + InDataType v_in = arg.input_(g, n, c, wi); + arg.output_(g, row, column) = ck::type_convert(v_in); } column++; } } }; - make_ParallelTensorFunctor(func, N, Wo)(std::thread::hardware_concurrency()); + make_ParallelTensorFunctor(func, G, N, Wo)(std::thread::hardware_concurrency()); return 0; } @@ -138,7 +137,7 @@ struct ReferenceImageToColumn : public device::BaseOperator const index_t Ho = arg.output_spatial_lengths_[0]; const index_t Wo = arg.output_spatial_lengths_[1]; - auto func = [&](auto n, auto ho, auto wo) { + auto func = [&](auto g, auto n, auto ho, auto wo) { index_t row = n * Ho * Wo + ho * Wo + wo; index_t column = 0; @@ -162,8 +161,9 @@ struct ReferenceImageToColumn : public device::BaseOperator wi >= 0 && ck::type_convert(wi) < arg.input_.GetLengths()[4]) { - InDataType v_in = arg.input_(0, n, c, hi, wi); - arg.output_(row, column) = ck::type_convert(v_in); + InDataType v_in = arg.input_(g, n, c, hi, wi); + arg.output_(g, row, column) = + ck::type_convert(v_in); } column++; } @@ -171,7 +171,7 @@ struct ReferenceImageToColumn : public device::BaseOperator } }; - make_ParallelTensorFunctor(func, N, Ho, Wo)(std::thread::hardware_concurrency()); + make_ParallelTensorFunctor(func, G, N, Ho, Wo)(std::thread::hardware_concurrency()); return 0; } @@ -181,7 +181,7 @@ struct ReferenceImageToColumn : public device::BaseOperator const index_t Ho = arg.output_spatial_lengths_[1]; const index_t Wo = arg.output_spatial_lengths_[2]; - auto func = [&](auto n, auto d_o, auto ho, auto wo) { + auto func = [&](auto g, auto n, auto d_o, auto ho, auto wo) { index_t row = n * Do * Ho * Wo + d_o * Ho * Wo + ho * Wo + wo; index_t column = 0; @@ -213,8 +213,8 @@ struct ReferenceImageToColumn : public device::BaseOperator ck::type_convert(wi) < arg.input_.GetLengths()[5]) { - InDataType v_in = arg.input_(0, n, c, di, hi, wi); - arg.output_(row, column) = + InDataType v_in = arg.input_(g, n, c, di, hi, wi); + arg.output_(g, row, column) = ck::type_convert(v_in); } column++; @@ -224,7 +224,7 @@ struct ReferenceImageToColumn : public device::BaseOperator } }; - make_ParallelTensorFunctor(func, N, Do, Ho, Wo)( + make_ParallelTensorFunctor(func, G, N, Do, Ho, Wo)( std::thread::hardware_concurrency()); return 0; @@ -267,8 +267,9 @@ struct ReferenceImageToColumn : public device::BaseOperator C * ck::accumulate_n( arg.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>()); - if(!(arg.output_.GetLengths()[0] == static_cast(NDoHoWo) && - arg.output_.GetLengths()[1] == static_cast(CZYX))) + if(!(arg.output_.GetLengths()[0] == static_cast(G) && + arg.output_.GetLengths()[1] == static_cast(NDoHoWo) && + arg.output_.GetLengths()[2] == static_cast(CZYX))) { return false; } diff --git a/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp b/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp index 57c0f88567..0be50a334a 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp @@ -19,109 +19,214 @@ namespace instance { using namespace ck::conv_tensor_rearrange_op; +// GNWC/GNHWC/GNDHWC // Image to Column -// nhwc, 1d -void add_device_image_to_column_nwc_1d_bf16_instances( +// GNWC, 1d +void add_device_image_to_column_gnwc_1d_bf16_instances( std::vector>>& instances); -void add_device_image_to_column_nwc_1d_f16_instances( +void add_device_image_to_column_gnwc_1d_f16_instances( std::vector>>& instances); -void add_device_image_to_column_nwc_1d_f32_instances( +void add_device_image_to_column_gnwc_1d_f32_instances( std::vector>>& instances); -void add_device_image_to_column_nwc_1d_i8_instances( +void add_device_image_to_column_gnwc_1d_i8_instances( std::vector>>& instances); -// nhwc, 2d -void add_device_image_to_column_nhwc_2d_bf16_instances( +// GNHWC, 2d +void add_device_image_to_column_gnhwc_2d_bf16_instances( std::vector>>& instances); -void add_device_image_to_column_nhwc_2d_f16_instances( +void add_device_image_to_column_gnhwc_2d_f16_instances( std::vector>>& instances); -void add_device_image_to_column_nhwc_2d_f32_instances( +void add_device_image_to_column_gnhwc_2d_f32_instances( std::vector>>& instances); -void add_device_image_to_column_nhwc_2d_i8_instances( +void add_device_image_to_column_gnhwc_2d_i8_instances( std::vector< std::unique_ptr>>& instances); -// nhwc, 3d -void add_device_image_to_column_ndhwc_3d_bf16_instances( +// GNDHWC, 3d +void add_device_image_to_column_gndhwc_3d_bf16_instances( std::vector>>& instances); -void add_device_image_to_column_ndhwc_3d_f16_instances( +void add_device_image_to_column_gndhwc_3d_f16_instances( std::vector>>& instances); -void add_device_image_to_column_ndhwc_3d_f32_instances( +void add_device_image_to_column_gndhwc_3d_f32_instances( std::vector>>& instances); -void add_device_image_to_column_ndhwc_3d_i8_instances( +void add_device_image_to_column_gndhwc_3d_i8_instances( std::vector< std::unique_ptr>>& instances); // Column to Image -// nhwc, 1d -void add_device_column_to_image_nwc_1d_bf16_instances( +// GNWC, 1d +void add_device_column_to_image_gnwc_1d_bf16_instances( std::vector>>& instances); -void add_device_column_to_image_nwc_1d_f16_instances( +void add_device_column_to_image_gnwc_1d_f16_instances( std::vector>>& instances); -void add_device_column_to_image_nwc_1d_f32_instances( +void add_device_column_to_image_gnwc_1d_f32_instances( std::vector>>& instances); -void add_device_column_to_image_nwc_1d_i8_instances( +void add_device_column_to_image_gnwc_1d_i8_instances( std::vector>>& instances); -// nhwc, 2d -void add_device_column_to_image_nhwc_2d_bf16_instances( +// GNHWC, 2d +void add_device_column_to_image_gnhwc_2d_bf16_instances( std::vector>>& instances); -void add_device_column_to_image_nhwc_2d_f16_instances( +void add_device_column_to_image_gnhwc_2d_f16_instances( std::vector>>& instances); -void add_device_column_to_image_nhwc_2d_f32_instances( +void add_device_column_to_image_gnhwc_2d_f32_instances( std::vector>>& instances); -void add_device_column_to_image_nhwc_2d_i8_instances( +void add_device_column_to_image_gnhwc_2d_i8_instances( std::vector< std::unique_ptr>>& instances); -// nhwc, 3d -void add_device_column_to_image_ndhwc_3d_bf16_instances( +// GNDHWC, 3d +void add_device_column_to_image_gndhwc_3d_bf16_instances( std::vector>>& instances); -void add_device_column_to_image_ndhwc_3d_f16_instances( +void add_device_column_to_image_gndhwc_3d_f16_instances( std::vector>>& instances); -void add_device_column_to_image_ndhwc_3d_f32_instances( +void add_device_column_to_image_gndhwc_3d_f32_instances( std::vector>>& instances); -void add_device_column_to_image_ndhwc_3d_i8_instances( +void add_device_column_to_image_gndhwc_3d_i8_instances( std::vector< std::unique_ptr>>& instances); +// NWGC/NHWGC/NDHWGC +// Image to Column +// NWGC, 1d +void add_device_image_to_column_nwgc_1d_bf16_instances( + std::vector>>& + instances); + +void add_device_image_to_column_nwgc_1d_f16_instances( + std::vector>>& + instances); + +void add_device_image_to_column_nwgc_1d_f32_instances( + std::vector>>& + instances); + +void add_device_image_to_column_nwgc_1d_i8_instances( + std::vector>>& + instances); +// NHWGC, 2d +void add_device_image_to_column_nhwgc_2d_bf16_instances( + std::vector>>& + instances); + +void add_device_image_to_column_nhwgc_2d_f16_instances( + std::vector>>& + instances); + +void add_device_image_to_column_nhwgc_2d_f32_instances( + std::vector>>& + instances); + +void add_device_image_to_column_nhwgc_2d_i8_instances( + std::vector< + std::unique_ptr>>& + instances); +// NDHWGC, 3d +void add_device_image_to_column_ndhwgc_3d_bf16_instances( + std::vector>>& + instances); + +void add_device_image_to_column_ndhwgc_3d_f16_instances( + std::vector>>& + instances); + +void add_device_image_to_column_ndhwgc_3d_f32_instances( + std::vector>>& + instances); + +void add_device_image_to_column_ndhwgc_3d_i8_instances( + std::vector< + std::unique_ptr>>& + instances); + +// Column to Image +// NWGC, 1d +void add_device_column_to_image_nwgc_1d_bf16_instances( + std::vector>>& + instances); + +void add_device_column_to_image_nwgc_1d_f16_instances( + std::vector>>& + instances); + +void add_device_column_to_image_nwgc_1d_f32_instances( + std::vector>>& + instances); + +void add_device_column_to_image_nwgc_1d_i8_instances( + std::vector>>& + instances); +// NHWGC, 2d +void add_device_column_to_image_nhwgc_2d_bf16_instances( + std::vector>>& + instances); + +void add_device_column_to_image_nhwgc_2d_f16_instances( + std::vector>>& + instances); + +void add_device_column_to_image_nhwgc_2d_f32_instances( + std::vector>>& + instances); + +void add_device_column_to_image_nhwgc_2d_i8_instances( + std::vector< + std::unique_ptr>>& + instances); +// NDHWGC, 3d +void add_device_column_to_image_ndhwgc_3d_bf16_instances( + std::vector>>& + instances); + +void add_device_column_to_image_ndhwgc_3d_f16_instances( + std::vector>>& + instances); + +void add_device_column_to_image_ndhwgc_3d_f32_instances( + std::vector>>& + instances); + +void add_device_column_to_image_ndhwgc_3d_i8_instances( + std::vector< + std::unique_ptr>>& + instances); template && is_same_v) { - add_device_image_to_column_nwc_1d_f32_instances(op_ptrs); + add_device_image_to_column_gnwc_1d_f32_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v) { - add_device_image_to_column_nwc_1d_f16_instances(op_ptrs); + add_device_image_to_column_gnwc_1d_f16_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v) { - add_device_image_to_column_nwc_1d_bf16_instances(op_ptrs); + add_device_image_to_column_gnwc_1d_bf16_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v) { - add_device_image_to_column_nwc_1d_i8_instances(op_ptrs); + add_device_image_to_column_gnwc_1d_i8_instances(op_ptrs); } } else if constexpr(NumDimSpatial == 2 && is_same_v) { if constexpr(is_same_v && is_same_v) { - add_device_image_to_column_nhwc_2d_f32_instances(op_ptrs); + add_device_image_to_column_gnhwc_2d_f32_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v) { - add_device_image_to_column_nhwc_2d_f16_instances(op_ptrs); + add_device_image_to_column_gnhwc_2d_f16_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v) { - add_device_image_to_column_nhwc_2d_bf16_instances(op_ptrs); + add_device_image_to_column_gnhwc_2d_bf16_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v) { - add_device_image_to_column_nhwc_2d_i8_instances(op_ptrs); + add_device_image_to_column_gnhwc_2d_i8_instances(op_ptrs); } } else if constexpr(NumDimSpatial == 3 && is_same_v) { if constexpr(is_same_v && is_same_v) { - add_device_image_to_column_ndhwc_3d_f32_instances(op_ptrs); + add_device_image_to_column_gndhwc_3d_f32_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v) { - add_device_image_to_column_ndhwc_3d_f16_instances(op_ptrs); + add_device_image_to_column_gndhwc_3d_f16_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v) { - add_device_image_to_column_ndhwc_3d_bf16_instances(op_ptrs); + add_device_image_to_column_gndhwc_3d_bf16_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v) { - add_device_image_to_column_ndhwc_3d_i8_instances(op_ptrs); + add_device_image_to_column_gndhwc_3d_i8_instances(op_ptrs); + } + } + else if constexpr(NumDimSpatial == 1 && is_same_v) + { + if constexpr(is_same_v && is_same_v) + { + add_device_image_to_column_nwgc_1d_f32_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v) + { + add_device_image_to_column_nwgc_1d_f16_instances(op_ptrs); + } + else if constexpr(is_same_v && + is_same_v) + { + add_device_image_to_column_nwgc_1d_bf16_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v) + { + add_device_image_to_column_nwgc_1d_i8_instances(op_ptrs); + } + } + else if constexpr(NumDimSpatial == 2 && is_same_v) + { + if constexpr(is_same_v && is_same_v) + { + add_device_image_to_column_nhwgc_2d_f32_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v) + { + add_device_image_to_column_nhwgc_2d_f16_instances(op_ptrs); + } + else if constexpr(is_same_v && + is_same_v) + { + add_device_image_to_column_nhwgc_2d_bf16_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v) + { + add_device_image_to_column_nhwgc_2d_i8_instances(op_ptrs); + } + } + else if constexpr(NumDimSpatial == 3 && is_same_v) + { + if constexpr(is_same_v && is_same_v) + { + add_device_image_to_column_ndhwgc_3d_f32_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v) + { + add_device_image_to_column_ndhwgc_3d_f16_instances(op_ptrs); + } + else if constexpr(is_same_v && + is_same_v) + { + add_device_image_to_column_ndhwgc_3d_bf16_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v) + { + add_device_image_to_column_ndhwgc_3d_i8_instances(op_ptrs); } } } @@ -214,60 +379,120 @@ struct DeviceOperationInstanceFactory< { if constexpr(is_same_v && is_same_v) { - add_device_column_to_image_nwc_1d_f32_instances(op_ptrs); + add_device_column_to_image_gnwc_1d_f32_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v) { - add_device_column_to_image_nwc_1d_f16_instances(op_ptrs); + add_device_column_to_image_gnwc_1d_f16_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v) { - add_device_column_to_image_nwc_1d_bf16_instances(op_ptrs); + add_device_column_to_image_gnwc_1d_bf16_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v) { - add_device_column_to_image_nwc_1d_i8_instances(op_ptrs); + add_device_column_to_image_gnwc_1d_i8_instances(op_ptrs); } } else if constexpr(NumDimSpatial == 2 && is_same_v) { if constexpr(is_same_v && is_same_v) { - add_device_column_to_image_nhwc_2d_f32_instances(op_ptrs); + add_device_column_to_image_gnhwc_2d_f32_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v) { - add_device_column_to_image_nhwc_2d_f16_instances(op_ptrs); + add_device_column_to_image_gnhwc_2d_f16_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v) { - add_device_column_to_image_nhwc_2d_bf16_instances(op_ptrs); + add_device_column_to_image_gnhwc_2d_bf16_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v) { - add_device_column_to_image_nhwc_2d_i8_instances(op_ptrs); + add_device_column_to_image_gnhwc_2d_i8_instances(op_ptrs); } } else if constexpr(NumDimSpatial == 3 && is_same_v) { if constexpr(is_same_v && is_same_v) { - add_device_column_to_image_ndhwc_3d_f32_instances(op_ptrs); + add_device_column_to_image_gndhwc_3d_f32_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v) { - add_device_column_to_image_ndhwc_3d_f16_instances(op_ptrs); + add_device_column_to_image_gndhwc_3d_f16_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v) { - add_device_column_to_image_ndhwc_3d_bf16_instances(op_ptrs); + add_device_column_to_image_gndhwc_3d_bf16_instances(op_ptrs); } else if constexpr(is_same_v && is_same_v) { - add_device_column_to_image_ndhwc_3d_i8_instances(op_ptrs); + add_device_column_to_image_gndhwc_3d_i8_instances(op_ptrs); + } + } + else if constexpr(NumDimSpatial == 1 && is_same_v) + { + if constexpr(is_same_v && is_same_v) + { + add_device_column_to_image_nwgc_1d_f32_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v) + { + add_device_column_to_image_nwgc_1d_f16_instances(op_ptrs); + } + else if constexpr(is_same_v && + is_same_v) + { + add_device_column_to_image_nwgc_1d_bf16_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v) + { + add_device_column_to_image_nwgc_1d_i8_instances(op_ptrs); + } + } + else if constexpr(NumDimSpatial == 2 && is_same_v) + { + if constexpr(is_same_v && is_same_v) + { + add_device_column_to_image_nhwgc_2d_f32_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v) + { + add_device_column_to_image_nhwgc_2d_f16_instances(op_ptrs); + } + else if constexpr(is_same_v && + is_same_v) + { + add_device_column_to_image_nhwgc_2d_bf16_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v) + { + add_device_column_to_image_nhwgc_2d_i8_instances(op_ptrs); + } + } + else if constexpr(NumDimSpatial == 3 && is_same_v) + { + if constexpr(is_same_v && is_same_v) + { + add_device_column_to_image_ndhwgc_3d_f32_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v) + { + add_device_column_to_image_ndhwgc_3d_f16_instances(op_ptrs); + } + else if constexpr(is_same_v && + is_same_v) + { + add_device_column_to_image_ndhwgc_3d_bf16_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v) + { + add_device_column_to_image_ndhwgc_3d_i8_instances(op_ptrs); } } } diff --git a/library/src/tensor_operation_instance/gpu/column_to_image/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/column_to_image/CMakeLists.txt index 5d1a554524..50855babb5 100644 --- a/library/src/tensor_operation_instance/gpu/column_to_image/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/column_to_image/CMakeLists.txt @@ -1,5 +1,8 @@ add_instance_library(device_column_to_image_instance - device_column_to_image_nhwc_1d_instance.cpp - device_column_to_image_nhwc_2d_instance.cpp - device_column_to_image_nhwc_3d_instance.cpp + device_column_to_image_gnwc_1d_instance.cpp + device_column_to_image_gnhwc_2d_instance.cpp + device_column_to_image_gndhwc_3d_instance.cpp + device_column_to_image_nwgc_1d_instance.cpp + device_column_to_image_nhwgc_2d_instance.cpp + device_column_to_image_ndhwgc_3d_instance.cpp ) diff --git a/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_3d_instance.cpp b/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_gndhwc_3d_instance.cpp similarity index 87% rename from library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_3d_instance.cpp rename to library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_gndhwc_3d_instance.cpp index 9762b46c43..8de2311251 100644 --- a/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_3d_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_gndhwc_3d_instance.cpp @@ -11,7 +11,7 @@ namespace instance { using namespace ck::conv_tensor_rearrange_op; -void add_device_column_to_image_ndhwc_3d_bf16_instances( +void add_device_column_to_image_gndhwc_3d_bf16_instances( std::vector>>& instances) { @@ -22,7 +22,7 @@ void add_device_column_to_image_ndhwc_3d_bf16_instances( #endif } -void add_device_column_to_image_ndhwc_3d_f16_instances( +void add_device_column_to_image_gndhwc_3d_f16_instances( std::vector>>& instances) { @@ -33,7 +33,7 @@ void add_device_column_to_image_ndhwc_3d_f16_instances( #endif } -void add_device_column_to_image_ndhwc_3d_f32_instances( +void add_device_column_to_image_gndhwc_3d_f32_instances( std::vector>>& instances) { @@ -44,7 +44,7 @@ void add_device_column_to_image_ndhwc_3d_f32_instances( #endif } -void add_device_column_to_image_ndhwc_3d_i8_instances( +void add_device_column_to_image_gndhwc_3d_i8_instances( std::vector< std::unique_ptr>>& instances) diff --git a/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_2d_instance.cpp b/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_gnhwc_2d_instance.cpp similarity index 88% rename from library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_2d_instance.cpp rename to library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_gnhwc_2d_instance.cpp index 4de665a63f..611a43e452 100644 --- a/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_2d_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_gnhwc_2d_instance.cpp @@ -11,7 +11,7 @@ namespace instance { using namespace ck::conv_tensor_rearrange_op; -void add_device_column_to_image_nhwc_2d_bf16_instances( +void add_device_column_to_image_gnhwc_2d_bf16_instances( std::vector>>& instances) { @@ -22,7 +22,7 @@ void add_device_column_to_image_nhwc_2d_bf16_instances( #endif } -void add_device_column_to_image_nhwc_2d_f16_instances( +void add_device_column_to_image_gnhwc_2d_f16_instances( std::vector>>& instances) { @@ -33,7 +33,7 @@ void add_device_column_to_image_nhwc_2d_f16_instances( #endif } -void add_device_column_to_image_nhwc_2d_f32_instances( +void add_device_column_to_image_gnhwc_2d_f32_instances( std::vector>>& instances) { @@ -44,7 +44,7 @@ void add_device_column_to_image_nhwc_2d_f32_instances( #endif } -void add_device_column_to_image_nhwc_2d_i8_instances( +void add_device_column_to_image_gnhwc_2d_i8_instances( std::vector< std::unique_ptr>>& instances) diff --git a/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_1d_instance.cpp b/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_gnwc_1d_instance.cpp similarity index 88% rename from library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_1d_instance.cpp rename to library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_gnwc_1d_instance.cpp index 8ba4d29775..c64a25df27 100644 --- a/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_1d_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_gnwc_1d_instance.cpp @@ -11,7 +11,7 @@ namespace instance { using namespace ck::conv_tensor_rearrange_op; -void add_device_column_to_image_nwc_1d_bf16_instances( +void add_device_column_to_image_gnwc_1d_bf16_instances( std::vector>>& instances) { @@ -22,7 +22,7 @@ void add_device_column_to_image_nwc_1d_bf16_instances( #endif } -void add_device_column_to_image_nwc_1d_f16_instances( +void add_device_column_to_image_gnwc_1d_f16_instances( std::vector>>& instances) { @@ -33,7 +33,7 @@ void add_device_column_to_image_nwc_1d_f16_instances( #endif } -void add_device_column_to_image_nwc_1d_f32_instances( +void add_device_column_to_image_gnwc_1d_f32_instances( std::vector>>& instances) { @@ -44,7 +44,7 @@ void add_device_column_to_image_nwc_1d_f32_instances( #endif } -void add_device_column_to_image_nwc_1d_i8_instances( +void add_device_column_to_image_gnwc_1d_i8_instances( std::vector>>& instances) { diff --git a/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_ndhwgc_3d_instance.cpp b/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_ndhwgc_3d_instance.cpp new file mode 100644 index 0000000000..eef5260e64 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_ndhwgc_3d_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using namespace ck::conv_tensor_rearrange_op; + +void add_device_column_to_image_ndhwgc_3d_bf16_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_BF16 + add_device_operation_instances(instances, device_column_to_image_bf16_instances<3, NDHWGC>{}); +#else + ignore = instances; +#endif +} + +void add_device_column_to_image_ndhwgc_3d_f16_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_FP16 + add_device_operation_instances(instances, device_column_to_image_f16_instances<3, NDHWGC>{}); +#else + ignore = instances; +#endif +} + +void add_device_column_to_image_ndhwgc_3d_f32_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_FP32 + add_device_operation_instances(instances, device_column_to_image_f32_instances<3, NDHWGC>{}); +#else + ignore = instances; +#endif +} + +void add_device_column_to_image_ndhwgc_3d_i8_instances( + std::vector< + std::unique_ptr>>& + instances) +{ +#ifdef CK_ENABLE_INT8 + add_device_operation_instances(instances, device_column_to_image_i8_instances<3, NDHWGC>{}); +#else + ignore = instances; +#endif +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwgc_2d_instance.cpp b/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwgc_2d_instance.cpp new file mode 100644 index 0000000000..910782ca7d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwgc_2d_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using namespace ck::conv_tensor_rearrange_op; + +void add_device_column_to_image_nhwgc_2d_bf16_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_BF16 + add_device_operation_instances(instances, device_column_to_image_bf16_instances<2, NHWGC>{}); +#else + ignore = instances; +#endif +} + +void add_device_column_to_image_nhwgc_2d_f16_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_FP16 + add_device_operation_instances(instances, device_column_to_image_f16_instances<2, NHWGC>{}); +#else + ignore = instances; +#endif +} + +void add_device_column_to_image_nhwgc_2d_f32_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_FP32 + add_device_operation_instances(instances, device_column_to_image_f32_instances<2, NHWGC>{}); +#else + ignore = instances; +#endif +} + +void add_device_column_to_image_nhwgc_2d_i8_instances( + std::vector< + std::unique_ptr>>& + instances) +{ +#ifdef CK_ENABLE_INT8 + add_device_operation_instances(instances, device_column_to_image_i8_instances<2, NHWGC>{}); +#else + ignore = instances; +#endif +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nwgc_1d_instance.cpp b/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nwgc_1d_instance.cpp new file mode 100644 index 0000000000..d806d91166 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nwgc_1d_instance.cpp @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using namespace ck::conv_tensor_rearrange_op; + +void add_device_column_to_image_nwgc_1d_bf16_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_BF16 + add_device_operation_instances(instances, device_column_to_image_bf16_instances<1, NWGC>{}); +#else + ignore = instances; +#endif +} + +void add_device_column_to_image_nwgc_1d_f16_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_FP16 + add_device_operation_instances(instances, device_column_to_image_f16_instances<1, NWGC>{}); +#else + ignore = instances; +#endif +} + +void add_device_column_to_image_nwgc_1d_f32_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_FP32 + add_device_operation_instances(instances, device_column_to_image_f32_instances<1, NWGC>{}); +#else + ignore = instances; +#endif +} + +void add_device_column_to_image_nwgc_1d_i8_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_INT8 + add_device_operation_instances(instances, device_column_to_image_i8_instances<1, NWGC>{}); +#else + ignore = instances; +#endif +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/image_to_column/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/image_to_column/CMakeLists.txt index de10369374..9e52a8157f 100644 --- a/library/src/tensor_operation_instance/gpu/image_to_column/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/image_to_column/CMakeLists.txt @@ -1,5 +1,8 @@ add_instance_library(device_image_to_column_instance - device_image_to_column_nhwc_1d_instance.cpp - device_image_to_column_nhwc_2d_instance.cpp - device_image_to_column_nhwc_3d_instance.cpp + device_image_to_column_gnwc_1d_instance.cpp + device_image_to_column_gnhwc_2d_instance.cpp + device_image_to_column_gndhwc_3d_instance.cpp + device_image_to_column_nwgc_1d_instance.cpp + device_image_to_column_nhwgc_2d_instance.cpp + device_image_to_column_ndhwgc_3d_instance.cpp ) diff --git a/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_3d_instance.cpp b/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_gndhwc_3d_instance.cpp similarity index 83% rename from library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_3d_instance.cpp rename to library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_gndhwc_3d_instance.cpp index 8290dae928..785019133e 100644 --- a/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_3d_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_gndhwc_3d_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp" #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" @@ -11,7 +11,7 @@ namespace instance { using namespace ck::conv_tensor_rearrange_op; -void add_device_image_to_column_ndhwc_3d_bf16_instances( +void add_device_image_to_column_gndhwc_3d_bf16_instances( std::vector>>& instances) { @@ -22,7 +22,7 @@ void add_device_image_to_column_ndhwc_3d_bf16_instances( #endif } -void add_device_image_to_column_ndhwc_3d_f16_instances( +void add_device_image_to_column_gndhwc_3d_f16_instances( std::vector>>& instances) { @@ -33,7 +33,7 @@ void add_device_image_to_column_ndhwc_3d_f16_instances( #endif } -void add_device_image_to_column_ndhwc_3d_f32_instances( +void add_device_image_to_column_gndhwc_3d_f32_instances( std::vector>>& instances) { @@ -44,7 +44,7 @@ void add_device_image_to_column_ndhwc_3d_f32_instances( #endif } -void add_device_image_to_column_ndhwc_3d_i8_instances( +void add_device_image_to_column_gndhwc_3d_i8_instances( std::vector< std::unique_ptr>>& instances) diff --git a/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_2d_instance.cpp b/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_gnhwc_2d_instance.cpp similarity index 84% rename from library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_2d_instance.cpp rename to library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_gnhwc_2d_instance.cpp index 7115e75667..5125649c89 100644 --- a/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_2d_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_gnhwc_2d_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp" #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" @@ -11,7 +11,7 @@ namespace instance { using namespace ck::conv_tensor_rearrange_op; -void add_device_image_to_column_nhwc_2d_bf16_instances( +void add_device_image_to_column_gnhwc_2d_bf16_instances( std::vector>>& instances) { @@ -22,7 +22,7 @@ void add_device_image_to_column_nhwc_2d_bf16_instances( #endif } -void add_device_image_to_column_nhwc_2d_f16_instances( +void add_device_image_to_column_gnhwc_2d_f16_instances( std::vector>>& instances) { @@ -33,7 +33,7 @@ void add_device_image_to_column_nhwc_2d_f16_instances( #endif } -void add_device_image_to_column_nhwc_2d_f32_instances( +void add_device_image_to_column_gnhwc_2d_f32_instances( std::vector>>& instances) { @@ -44,7 +44,7 @@ void add_device_image_to_column_nhwc_2d_f32_instances( #endif } -void add_device_image_to_column_nhwc_2d_i8_instances( +void add_device_image_to_column_gnhwc_2d_i8_instances( std::vector< std::unique_ptr>>& instances) diff --git a/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_1d_instance.cpp b/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_gnwc_1d_instance.cpp similarity index 84% rename from library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_1d_instance.cpp rename to library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_gnwc_1d_instance.cpp index 3a629f2346..92624d9b2c 100644 --- a/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_1d_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_gnwc_1d_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. #include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp" #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" @@ -11,7 +11,7 @@ namespace instance { using namespace ck::conv_tensor_rearrange_op; -void add_device_image_to_column_nwc_1d_bf16_instances( +void add_device_image_to_column_gnwc_1d_bf16_instances( std::vector>>& instances) { @@ -22,7 +22,7 @@ void add_device_image_to_column_nwc_1d_bf16_instances( #endif } -void add_device_image_to_column_nwc_1d_f16_instances( +void add_device_image_to_column_gnwc_1d_f16_instances( std::vector>>& instances) { @@ -33,7 +33,7 @@ void add_device_image_to_column_nwc_1d_f16_instances( #endif } -void add_device_image_to_column_nwc_1d_f32_instances( +void add_device_image_to_column_gnwc_1d_f32_instances( std::vector>>& instances) { @@ -44,7 +44,7 @@ void add_device_image_to_column_nwc_1d_f32_instances( #endif } -void add_device_image_to_column_nwc_1d_i8_instances( +void add_device_image_to_column_gnwc_1d_i8_instances( std::vector>>& instances) { diff --git a/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_ndhwgc_3d_instance.cpp b/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_ndhwgc_3d_instance.cpp new file mode 100644 index 0000000000..89f33f356e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_ndhwgc_3d_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using namespace ck::conv_tensor_rearrange_op; + +void add_device_image_to_column_ndhwgc_3d_bf16_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_BF16 + add_device_operation_instances(instances, device_image_to_column_bf16_instances<3, NDHWGC>{}); +#else + ignore = instances; +#endif +} + +void add_device_image_to_column_ndhwgc_3d_f16_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_FP16 + add_device_operation_instances(instances, device_image_to_column_f16_instances<3, NDHWGC>{}); +#else + ignore = instances; +#endif +} + +void add_device_image_to_column_ndhwgc_3d_f32_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_FP32 + add_device_operation_instances(instances, device_image_to_column_f32_instances<3, NDHWGC>{}); +#else + ignore = instances; +#endif +} + +void add_device_image_to_column_ndhwgc_3d_i8_instances( + std::vector< + std::unique_ptr>>& + instances) +{ +#ifdef CK_ENABLE_INT8 + add_device_operation_instances(instances, device_image_to_column_i8_instances<3, NDHWGC>{}); +#else + ignore = instances; +#endif +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwgc_2d_instance.cpp b/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwgc_2d_instance.cpp new file mode 100644 index 0000000000..eb0b5f14b0 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwgc_2d_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using namespace ck::conv_tensor_rearrange_op; + +void add_device_image_to_column_nhwgc_2d_bf16_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_BF16 + add_device_operation_instances(instances, device_image_to_column_bf16_instances<2, NHWGC>{}); +#else + ignore = instances; +#endif +} + +void add_device_image_to_column_nhwgc_2d_f16_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_FP16 + add_device_operation_instances(instances, device_image_to_column_f16_instances<2, NHWGC>{}); +#else + ignore = instances; +#endif +} + +void add_device_image_to_column_nhwgc_2d_f32_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_FP32 + add_device_operation_instances(instances, device_image_to_column_f32_instances<2, NHWGC>{}); +#else + ignore = instances; +#endif +} + +void add_device_image_to_column_nhwgc_2d_i8_instances( + std::vector< + std::unique_ptr>>& + instances) +{ +#ifdef CK_ENABLE_INT8 + add_device_operation_instances(instances, device_image_to_column_i8_instances<2, NHWGC>{}); +#else + ignore = instances; +#endif +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nwgc_1d_instance.cpp b/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nwgc_1d_instance.cpp new file mode 100644 index 0000000000..06723b4d04 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nwgc_1d_instance.cpp @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using namespace ck::conv_tensor_rearrange_op; + +void add_device_image_to_column_nwgc_1d_bf16_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_BF16 + add_device_operation_instances(instances, device_image_to_column_bf16_instances<1, NWGC>{}); +#else + ignore = instances; +#endif +} + +void add_device_image_to_column_nwgc_1d_f16_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_FP16 + add_device_operation_instances(instances, device_image_to_column_f16_instances<1, NWGC>{}); +#else + ignore = instances; +#endif +} + +void add_device_image_to_column_nwgc_1d_f32_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_FP32 + add_device_operation_instances(instances, device_image_to_column_f32_instances<1, NWGC>{}); +#else + ignore = instances; +#endif +} + +void add_device_image_to_column_nwgc_1d_i8_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_INT8 + add_device_operation_instances(instances, device_image_to_column_i8_instances<1, NWGC>{}); +#else + ignore = instances; +#endif +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/profiler/README.md b/profiler/README.md index 98eae3a763..0c745bf8f2 100644 --- a/profiler/README.md +++ b/profiler/README.md @@ -194,7 +194,8 @@ Note: This kernel use atomic add, this will cause output buffer to be accumulate # 1: Input fp16, Weight fp16, Output fp16 # 2: Input bf16, Weight bf16, Output bf16 # 3: Input int8, Weight int8, Output int8) -# arg3: tensor layout (0: Input[N, Hi, Wi, C], Output[N * Ho * Wo, Y * X * C]) +# arg3: tensor layout (0: Input[G, N, Hi, Wi, C], Output[G * N * Ho * Wo, Y * X * C], +# 1: Input[N, Hi, Wi, G, C], Output[N * Ho * Wo * G, Y * X * C]) # arg4: verification (0: no, 1: yes) # arg5: initialization (0: no init, 1: integer value, 2: decimal value) # arg6: print tensor value (0: no; 1: yes) diff --git a/profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp b/profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp index dc3d748bf6..fa0a771962 100644 --- a/profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp +++ b/profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp @@ -93,6 +93,26 @@ static auto make_ref_op() } } +template +static auto create_gemm_desc(const ck::index_t G, const ck::index_t NDoHoWo, const ck::index_t CZYX) +{ + using namespace ck::tensor_layout::convolution; + if constexpr(std::is_same_v || std::is_same_v || + std::is_same_v) + { + return HostTensorDescriptor({G, NDoHoWo, CZYX}); + } + else if constexpr(std::is_same_v || std::is_same_v || + std::is_same_v) + { + return HostTensorDescriptor({G, NDoHoWo, CZYX}, {CZYX, CZYX * G, 1}); + } + else + { + throw std::runtime_error("Unsupported layout!"); + } +} + template ( conv_param); - const auto gemm_desc = HostTensorDescriptor({NDoHoWo, CZYX}); + const auto gemm_desc = create_gemm_desc(conv_param.G_, NDoHoWo, CZYX); std::array input_spatial_lengths{}; std::array filter_spatial_lengths{}; std::array output_spatial_lengths{}; std::array image_g_n_c_wis_strides{}; - std::array gemm_m_k_strides{}; + std::array gemm_g_m_k_strides{}; std::array conv_filter_strides{}; std::array conv_filter_dilations{}; std::array input_left_pads{}; @@ -134,7 +154,7 @@ bool profile_conv_tensor_rearrange_impl(int do_verification, copy(conv_param.filter_spatial_lengths_, filter_spatial_lengths); copy(conv_param.output_spatial_lengths_, output_spatial_lengths); copy(image_desc.GetStrides(), image_g_n_c_wis_strides); - copy(gemm_desc.GetStrides(), gemm_m_k_strides); + copy(gemm_desc.GetStrides(), gemm_g_m_k_strides); copy(conv_param.conv_filter_strides_, conv_filter_strides); copy(conv_param.conv_filter_dilations_, conv_filter_dilations); copy(conv_param.input_left_pads_, input_left_pads); @@ -212,13 +232,14 @@ bool profile_conv_tensor_rearrange_impl(int do_verification, auto argument_ptr = op_ptr->MakeArgumentPointer( static_cast(in_device_buf.GetDeviceBuffer()), static_cast(out_device_buf.GetDeviceBuffer()), + conv_param.G_, conv_param.N_, conv_param.C_, input_spatial_lengths, filter_spatial_lengths, output_spatial_lengths, image_g_n_c_wis_strides, - gemm_m_k_strides, + gemm_g_m_k_strides, conv_filter_strides, conv_filter_dilations, input_left_pads, @@ -234,7 +255,7 @@ bool profile_conv_tensor_rearrange_impl(int do_verification, float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); std::size_t num_btype = - NDoHoWo * CZYX * (sizeof(OutputDataType) + sizeof(InputDataType)); + conv_param.G_ * NDoHoWo * CZYX * (sizeof(OutputDataType) + sizeof(InputDataType)); float gb_per_sec = num_btype / 1.E6 / avg_time; std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, " << op_name << std::endl; diff --git a/profiler/src/profile_conv_tensor_rearrange.cpp b/profiler/src/profile_conv_tensor_rearrange.cpp index bad5ce40a5..6420698a28 100644 --- a/profiler/src/profile_conv_tensor_rearrange.cpp +++ b/profiler/src/profile_conv_tensor_rearrange.cpp @@ -19,7 +19,8 @@ enum struct RearrangeOp enum struct ConvLayout { - NHWC, // 0 + GNHWC, // 0 + NHWGC, // 1 }; enum struct DataType @@ -42,7 +43,8 @@ static void print_helper_msg() << " 1: Input fp16, Weight fp16, Output fp16\n" << " 2: Input bf16, Weight bf16, Output bf16\n" << " 3: Input int8, Weight int8, Output int8)\n" - << "arg3: tensor layout (0: Input[N, Hi, Wi, C], Output[N * Ho * Wo, Y * X * C])\n" + << "arg3: tensor layout (0: Input[G, N, Hi, Wi, C], Output[G * N * Ho * Wo, Y * X * C],\n" + << " 1: Input[N, Hi, Wi, G, C], Output[N * Ho * Wo * G, Y * X * C])\n" << "arg4: verification (0: no, 1: yes)\n" << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n" << "arg6: print tensor value (0: no; 1: yes)\n" @@ -114,11 +116,9 @@ int profile_conv_tensor_rearrange(int argc, char* argv[]) return pass ? 0 : 1; }; - // Image To Column if(rearrange_op == RearrangeOp::ImageToColumn) { - // NHWC - if(layout == ConvLayout::NHWC) + if(layout == ConvLayout::GNHWC) { if(num_dim_spatial == 1) { @@ -178,11 +178,70 @@ int profile_conv_tensor_rearrange(int argc, char* argv[]) } } } + else if(layout == ConvLayout::NHWGC) + { + if(num_dim_spatial == 1) + { + if(data_type == DataType::F32_F32) + { + return profile(I1, NWGC{}, F32{}, F32{}, ImageToColumn{}); + } + else if(data_type == DataType::F16_F16) + { + return profile(I1, NWGC{}, F16{}, F16{}, ImageToColumn{}); + } + else if(data_type == DataType::BF16_BF16) + { + return profile(I1, NWGC{}, BF16{}, BF16{}, ImageToColumn{}); + } + else if(data_type == DataType::INT8_INT8) + { + return profile(I1, NWGC{}, INT8{}, INT8{}, ImageToColumn{}); + } + } + else if(num_dim_spatial == 2) + { + if(data_type == DataType::F32_F32) + { + return profile(I2, NHWGC{}, F32{}, F32{}, ImageToColumn{}); + } + else if(data_type == DataType::F16_F16) + { + return profile(I2, NHWGC{}, F16{}, F16{}, ImageToColumn{}); + } + else if(data_type == DataType::BF16_BF16) + { + return profile(I2, NHWGC{}, BF16{}, BF16{}, ImageToColumn{}); + } + else if(data_type == DataType::INT8_INT8) + { + return profile(I2, NHWGC{}, INT8{}, INT8{}, ImageToColumn{}); + } + } + else if(num_dim_spatial == 3) + { + if(data_type == DataType::F32_F32) + { + return profile(I3, NDHWGC{}, F32{}, F32{}, ImageToColumn{}); + } + else if(data_type == DataType::F16_F16) + { + return profile(I3, NDHWGC{}, F16{}, F16{}, ImageToColumn{}); + } + else if(data_type == DataType::BF16_BF16) + { + return profile(I3, NDHWGC{}, BF16{}, BF16{}, ImageToColumn{}); + } + else if(data_type == DataType::INT8_INT8) + { + return profile(I3, NDHWGC{}, INT8{}, INT8{}, ImageToColumn{}); + } + } + } } else if(rearrange_op == RearrangeOp::ColumnToImage) { - // NHWC - if(layout == ConvLayout::NHWC) + if(layout == ConvLayout::GNHWC) { if(num_dim_spatial == 1) { @@ -242,6 +301,66 @@ int profile_conv_tensor_rearrange(int argc, char* argv[]) } } } + else if(layout == ConvLayout::NHWGC) + { + if(num_dim_spatial == 1) + { + if(data_type == DataType::F32_F32) + { + return profile(I1, NWGC{}, F32{}, F32{}, ColumnToImage{}); + } + else if(data_type == DataType::F16_F16) + { + return profile(I1, NWGC{}, F16{}, F16{}, ColumnToImage{}); + } + else if(data_type == DataType::BF16_BF16) + { + return profile(I1, NWGC{}, BF16{}, BF16{}, ColumnToImage{}); + } + else if(data_type == DataType::INT8_INT8) + { + return profile(I1, NWGC{}, INT8{}, INT8{}, ColumnToImage{}); + } + } + else if(num_dim_spatial == 2) + { + if(data_type == DataType::F32_F32) + { + return profile(I2, NHWGC{}, F32{}, F32{}, ColumnToImage{}); + } + else if(data_type == DataType::F16_F16) + { + return profile(I2, NHWGC{}, F16{}, F16{}, ColumnToImage{}); + } + else if(data_type == DataType::BF16_BF16) + { + return profile(I2, NHWGC{}, BF16{}, BF16{}, ColumnToImage{}); + } + else if(data_type == DataType::INT8_INT8) + { + return profile(I2, NHWGC{}, INT8{}, INT8{}, ColumnToImage{}); + } + } + else if(num_dim_spatial == 3) + { + if(data_type == DataType::F32_F32) + { + return profile(I3, NDHWGC{}, F32{}, F32{}, ColumnToImage{}); + } + else if(data_type == DataType::F16_F16) + { + return profile(I3, NDHWGC{}, F16{}, F16{}, ColumnToImage{}); + } + else if(data_type == DataType::BF16_BF16) + { + return profile(I3, NDHWGC{}, BF16{}, BF16{}, ColumnToImage{}); + } + else if(data_type == DataType::INT8_INT8) + { + return profile(I3, NDHWGC{}, INT8{}, INT8{}, ColumnToImage{}); + } + } + } } std::cout << "this data_type & layout is not implemented" << std::endl; diff --git a/test/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp b/test/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp index 7065b03e0d..5cb8731b26 100644 --- a/test/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp +++ b/test/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp @@ -45,14 +45,20 @@ class TestConvTensorRearrange : public ::testing::Test using namespace ck::tensor_layout::convolution; using namespace ck::conv_tensor_rearrange_op; -using KernelTypes1d = - ::testing::Types, std::tuple>; +using KernelTypes1d = ::testing::Types, + std::tuple, + std::tuple, + std::tuple>; -using KernelTypes2d = - ::testing::Types, std::tuple>; +using KernelTypes2d = ::testing::Types, + std::tuple, + std::tuple, + std::tuple>; -using KernelTypes3d = - ::testing::Types, std::tuple>; +using KernelTypes3d = ::testing::Types, + std::tuple, + std::tuple, + std::tuple>; template class TestConvTensorRearrange1d : public TestConvTensorRearrange @@ -77,16 +83,16 @@ TYPED_TEST(TestConvTensorRearrange1d, Test1D) { this->conv_params.clear(); - this->conv_params.push_back({1, 1, 4, 1, 192, {3}, {28}, {1}, {1}, {1}, {1}}); - this->conv_params.push_back({1, 1, 64, 1, 64, {3}, {14}, {1}, {1}, {1}, {1}}); - this->conv_params.push_back({1, 1, 64, 1, 64, {1}, {7}, {3}, {1}, {0}, {0}}); - this->conv_params.push_back({1, 1, 64, 1, 64, {1}, {3}, {1}, {1}, {0}, {0}}); + this->conv_params.push_back({1, 2, 4, 1, 192, {3}, {28}, {1}, {1}, {1}, {1}}); + this->conv_params.push_back({1, 2, 64, 1, 64, {3}, {14}, {1}, {1}, {1}, {1}}); + this->conv_params.push_back({1, 2, 64, 1, 64, {1}, {7}, {3}, {1}, {0}, {0}}); + this->conv_params.push_back({1, 2, 64, 1, 64, {1}, {3}, {1}, {1}, {0}, {0}}); // ScalarPerVector should be 1 - this->conv_params.push_back({1, 1, 4, 1, 1, {3}, {28}, {1}, {1}, {1}, {1}}); + this->conv_params.push_back({1, 2, 4, 1, 1, {3}, {28}, {1}, {1}, {1}, {1}}); // stride != 1 - this->conv_params.push_back({1, 1, 1, 1, 4, {3}, {28}, {2}, {1}, {1}, {1}}); + this->conv_params.push_back({1, 2, 1, 1, 4, {3}, {28}, {2}, {1}, {1}, {1}}); // dilation != 1 - this->conv_params.push_back({1, 1, 1, 1, 4, {3}, {28}, {1}, {2}, {1}, {1}}); + this->conv_params.push_back({1, 2, 1, 1, 4, {3}, {28}, {1}, {2}, {1}, {1}}); #ifdef CK_ENABLE_FP32 this->template Run<1, float, float>(); #endif @@ -106,13 +112,13 @@ TYPED_TEST(TestConvTensorRearrange2d, Test2D) this->conv_params.clear(); this->conv_params.push_back( - {2, 1, 4, 1, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + {2, 2, 4, 1, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); this->conv_params.push_back( - {2, 1, 64, 1, 64, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + {2, 2, 64, 1, 64, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); this->conv_params.push_back({2, 1, 64, 1, 64, {1, 1}, {7, 7}, {3, 3}, {1, 1}, {0, 0}, {0, 0}}); this->conv_params.push_back({2, 1, 64, 1, 64, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}); this->conv_params.push_back( - {2, 1, 64, 1, 64, {3, 3}, {28, 28}, {2, 2}, {2, 2}, {1, 1}, {1, 1}}); + {2, 2, 64, 1, 64, {3, 3}, {28, 28}, {2, 2}, {2, 2}, {1, 1}, {1, 1}}); #ifdef CK_ENABLE_FP32 this->template Run<2, float, float>(); #endif @@ -131,13 +137,13 @@ TYPED_TEST(TestConvTensorRearrange3d, Test3D) { this->conv_params.clear(); this->conv_params.push_back( - {3, 1, 16, 1, 64, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {3, 3, 3}, {0, 0, 0}, {0, 0, 0}}); + {3, 2, 16, 1, 64, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {3, 3, 3}, {0, 0, 0}, {0, 0, 0}}); this->conv_params.push_back( - {3, 1, 2, 1, 64, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); + {3, 2, 2, 1, 64, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); this->conv_params.push_back( - {3, 1, 32, 1, 64, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); + {3, 2, 32, 1, 64, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); this->conv_params.push_back( - {3, 1, 64, 1, 64, {3, 3, 3}, {14, 14, 14}, {2, 2, 2}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}}); + {3, 2, 64, 1, 64, {3, 3, 3}, {14, 14, 14}, {2, 2, 2}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}}); #ifdef CK_ENABLE_FP32 this->template Run<3, float, float>(); #endif diff --git a/test/conv_tensor_rearrange/test_conv_tensor_rearrange_interface.cpp b/test/conv_tensor_rearrange/test_conv_tensor_rearrange_interface.cpp index 57fcdc4269..67c0f2698c 100644 --- a/test/conv_tensor_rearrange/test_conv_tensor_rearrange_interface.cpp +++ b/test/conv_tensor_rearrange/test_conv_tensor_rearrange_interface.cpp @@ -53,7 +53,7 @@ class TestConvTensorRearrangeInterface : public ::testing::Test template bool Run() { - + const auto G = conv_param.G_; const auto N = conv_param.N_; const auto C = conv_param.C_; const auto FakeC = @@ -71,13 +71,13 @@ class TestConvTensorRearrangeInterface : public ::testing::Test const auto image_desc = ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed( conv_param); - const auto gemm_desc = HostTensorDescriptor({NDoHoWo, CZYX}); + const auto gemm_desc = HostTensorDescriptor({G, NDoHoWo, CZYX}); std::array input_spatial_lengths{}; std::array filter_spatial_lengths{}; std::array output_spatial_lengths{}; std::array input_g_n_c_wis_strides{}; - std::array output_m_k_strides{}; + std::array output_g_m_k_strides{}; std::array conv_filter_strides{}; std::array conv_filter_dilations{}; std::array input_left_pads{}; @@ -89,7 +89,7 @@ class TestConvTensorRearrangeInterface : public ::testing::Test copy(conv_param.filter_spatial_lengths_, filter_spatial_lengths); copy(conv_param.output_spatial_lengths_, output_spatial_lengths); copy(image_desc.GetStrides(), input_g_n_c_wis_strides); - copy(gemm_desc.GetStrides(), output_m_k_strides); + copy(gemm_desc.GetStrides(), output_g_m_k_strides); copy(conv_param.conv_filter_strides_, conv_filter_strides); copy(conv_param.conv_filter_dilations_, conv_filter_dilations); copy(conv_param.input_left_pads_, input_left_pads); @@ -100,13 +100,14 @@ class TestConvTensorRearrangeInterface : public ::testing::Test auto img2col = DeviceImgToColInstance{}; auto argument = img2col.MakeArgument(nullptr, nullptr, + G, N, IsCPacked ? C : FakeC, input_spatial_lengths, filter_spatial_lengths, output_spatial_lengths, input_g_n_c_wis_strides, - output_m_k_strides, + output_g_m_k_strides, conv_filter_strides, conv_filter_dilations, input_left_pads, @@ -119,13 +120,14 @@ class TestConvTensorRearrangeInterface : public ::testing::Test auto col2img = DeviceColToimgInstance{}; auto argument = col2img.MakeArgument(nullptr, nullptr, + G, N, IsCPacked ? C : FakeC, input_spatial_lengths, filter_spatial_lengths, output_spatial_lengths, input_g_n_c_wis_strides, - output_m_k_strides, + output_g_m_k_strides, conv_filter_strides, conv_filter_dilations, input_left_pads,