From be5cb244c035dc29baa6013eae717301b3542f8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bart=C5=82omiej=20Kocot?= Date: Wed, 27 Sep 2023 17:19:06 +0200 Subject: [PATCH] Add column to image kernel (#930) * Add column to image kernel * Minor fixes for dtypes and client examples * Disable tests for disabled dtypes * Disable add instances functions for disabled data types * Minor stylistic fixes * Revert "Disable add instances functions for disabled data types" This reverts commit 728b86956378dcd9415fd0f2557833a068fe1c10. * Instances reduction * Add comments in device_column_to_image_impl * Update changelog and Copyrights * Improve changelog [ROCm/composable_kernel commit: e2243a4d1e579c46a57f18afae536a162ab0f4bf] --- CHANGELOG.md | 13 +- .../20_image_to_column/CMakeLists.txt | 2 - .../22_im2col_col2im/CMakeLists.txt | 5 + .../22_im2col_col2im/column_to_image.cpp | 173 +++++ .../image_to_column.cpp | 26 +- .../CMakeLists.txt | 6 +- .../52_im2col_col2im/column_to_image_f32.cpp | 165 +++++ .../common.hpp | 4 +- .../image_to_column_f32.cpp | 23 +- .../gpu/device/conv_tensor_rearrange_op.hpp | 33 + ...n.hpp => device_conv_tensor_rearrange.hpp} | 27 +- .../impl/device_column_to_image_impl.hpp | 621 ++++++++++++++++++ .../impl/device_image_to_column_impl.hpp | 144 ++-- ...lumn.hpp => gridwise_tensor_rearrange.hpp} | 75 ++- .../transform_conv_fwd_to_gemm.hpp | 345 +--------- include/ck/utility/dynamic_buffer.hpp | 34 +- .../cpu/reference_column_to_image.hpp | 363 ++++++++++ .../cpu/reference_image_to_column.hpp | 12 +- .../gpu/conv_tensor_rearrange.hpp | 282 ++++++++ .../device_column_to_image_instance.hpp | 106 +++ .../device_image_to_column_instance.hpp | 32 +- .../gpu/image_to_column.hpp | 135 ---- .../gpu/column_to_image/CMakeLists.txt | 5 + ...evice_column_to_image_nhwc_1d_instance.cpp | 61 ++ ...evice_column_to_image_nhwc_2d_instance.cpp | 62 ++ ...evice_column_to_image_nhwc_3d_instance.cpp | 62 ++ ...evice_image_to_column_nhwc_1d_instance.cpp | 40 +- ...evice_image_to_column_nhwc_2d_instance.cpp | 33 +- ...evice_image_to_column_nhwc_3d_instance.cpp | 41 +- profiler/README.md | 8 +- ...=> profile_conv_tensor_rearrange_impl.hpp} | 140 +++- profiler/src/CMakeLists.txt | 3 +- .../src/profile_conv_tensor_rearrange.cpp | 251 +++++++ profiler/src/profile_image_to_column.cpp | 169 ----- test/CMakeLists.txt | 2 +- test/conv_tensor_rearrange/CMakeLists.txt | 4 + .../test_conv_tensor_rearrange.cpp | 153 +++++ .../test_conv_tensor_rearrange_interface.cpp | 260 ++++++++ test/image_to_column/CMakeLists.txt | 4 - test/image_to_column/test_image_to_column.cpp | 121 ---- .../test_image_to_column_interface.cpp | 196 ------ 41 files changed, 3040 insertions(+), 1201 deletions(-) delete mode 100644 client_example/20_image_to_column/CMakeLists.txt create mode 100644 client_example/22_im2col_col2im/CMakeLists.txt create mode 100644 client_example/22_im2col_col2im/column_to_image.cpp rename client_example/{20_image_to_column => 22_im2col_col2im}/image_to_column.cpp (84%) rename example/{52_image_to_column => 52_im2col_col2im}/CMakeLists.txt (50%) create mode 100644 example/52_im2col_col2im/column_to_image_f32.cpp rename example/{52_image_to_column => 52_im2col_col2im}/common.hpp (94%) rename example/{52_image_to_column => 52_im2col_col2im}/image_to_column_f32.cpp (89%) create mode 100644 include/ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp rename include/ck/tensor_operation/gpu/device/{device_image_to_column.hpp => device_conv_tensor_rearrange.hpp} (74%) create mode 100644 include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp rename include/ck/tensor_operation/gpu/grid/{gridwise_image_to_column.hpp => gridwise_tensor_rearrange.hpp} (52%) create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp rename library/include/ck/library/tensor_operation_instance/gpu/{image_to_column => conv_tensor_rearrange}/device_image_to_column_instance.hpp (70%) delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/image_to_column.hpp create mode 100644 library/src/tensor_operation_instance/gpu/column_to_image/CMakeLists.txt create mode 100644 library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_1d_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_2d_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_3d_instance.cpp rename profiler/include/profiler/{profile_image_to_column_impl.hpp => profile_conv_tensor_rearrange_impl.hpp} (56%) create mode 100644 profiler/src/profile_conv_tensor_rearrange.cpp delete mode 100644 profiler/src/profile_image_to_column.cpp create mode 100644 test/conv_tensor_rearrange/CMakeLists.txt create mode 100644 test/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp create mode 100644 test/conv_tensor_rearrange/test_conv_tensor_rearrange_interface.cpp delete mode 100644 test/image_to_column/CMakeLists.txt delete mode 100644 test/image_to_column/test_image_to_column.cpp delete mode 100644 test/image_to_column/test_image_to_column_interface.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 31f129b581..9d38a66382 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,18 @@ Full documentation for Composable Kernel is not yet available. +## (Unreleased) CK for ROCm 6.0.0 + +### Fixed + +### Optimizations + +### Added +- Added image to column (#867) and column to image kernels (#930). + +### Changed + + ## CK 0.2.0 for ROCm 5.5.0 ### Fixed @@ -29,4 +41,3 @@ Full documentation for Composable Kernel is not yet available. - Added MaxPool backward (#750). ### Changed -- Changed ... diff --git a/client_example/20_image_to_column/CMakeLists.txt b/client_example/20_image_to_column/CMakeLists.txt deleted file mode 100644 index 80edcd0416..0000000000 --- a/client_example/20_image_to_column/CMakeLists.txt +++ /dev/null @@ -1,2 +0,0 @@ -add_executable(client_image_to_column image_to_column.cpp) -target_link_libraries(client_image_to_column PRIVATE composable_kernel::device_operations) diff --git a/client_example/22_im2col_col2im/CMakeLists.txt b/client_example/22_im2col_col2im/CMakeLists.txt new file mode 100644 index 0000000000..47ac42fe87 --- /dev/null +++ b/client_example/22_im2col_col2im/CMakeLists.txt @@ -0,0 +1,5 @@ +add_executable(client_image_to_column image_to_column.cpp) +target_link_libraries(client_image_to_column PRIVATE composable_kernel::device_operations) + +add_executable(client_column_to_image column_to_image.cpp) +target_link_libraries(client_column_to_image PRIVATE composable_kernel::device_operations) diff --git a/client_example/22_im2col_col2im/column_to_image.cpp b/client_example/22_im2col_col2im/column_to_image.cpp new file mode 100644 index 0000000000..43338ce408 --- /dev/null +++ b/client_example/22_im2col_col2im/column_to_image.cpp @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp" +#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" + +using InDataType = ck::half_t; +using OutDataType = ck::half_t; + +using ImageLayout = ck::tensor_layout::convolution::GNHWC; + +static constexpr ck::index_t NumDimSpatial = 2; +static constexpr ck::index_t G = 1; +static constexpr ck::index_t N = 32; // batch size +static constexpr ck::index_t C = 32; // input channel (per group) +static constexpr ck::index_t Y = 3; // filter H +static constexpr ck::index_t X = 3; // filter W +static constexpr ck::index_t Hi = 28; // input H +static constexpr ck::index_t Wi = 28; // input W +static constexpr ck::index_t Ho = 28; // output H +static constexpr ck::index_t Wo = 28; // output W + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main() +{ + + std::array in_spatial_lengths{Hi, Wi}; + std::array wei_spatial_lengths{Y, X}; + std::array out_spatial_lengths{Ho, Wo}; + + // We have NHWGC in memory space (G is dummy) + // However, CK's API only accept length and stride with order of GNCHW + // Hence, we need to adjust the order of stride + std::array image_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C}; + std::array gemm_strides{Y * X * C, 1}; + + std::array filter_strides{1, 1}; + std::array filter_dilations{1, 1}; + std::array input_left_pads{1, 1}; + std::array input_right_pads{1, 1}; + + SimpleDeviceMem in(sizeof(InDataType) * N * Ho * Wo * Y * X * C); + SimpleDeviceMem out(sizeof(OutDataType) * N * Hi * Wi * G * C); + + using namespace ck::conv_tensor_rearrange_op; + + using DeviceOp = ck::tensor_operation::device::DeviceConvTensorRearrange; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + int best_op_id = -1; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + out.GetDeviceBuffer(), + N, + C, + in_spatial_lengths, + out_spatial_lengths, + wei_spatial_lengths, + image_strides, + gemm_strides, + filter_strides, + filter_dilations, + input_left_pads, + input_right_pads); + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * G * C + + sizeof(OutDataType) * N * Ho * Wo * Y * X * C; + + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, " + << op_name << std::endl; + + if(avg_time < best_avg_time) + { + best_op_id = i; + best_op_name = op_name; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cerr << op_name << " does not support this problem" << std::endl; + } + } + + if(best_op_id < 0) + { + std::cerr << "no suitable instance" << std::endl; + return EXIT_FAILURE; + } + + std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_gb_per_sec + << " GB/s, " << best_op_name << std::endl; + + // run the best intance + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), + out.GetDeviceBuffer(), + N, + C, + in_spatial_lengths, + out_spatial_lengths, + wei_spatial_lengths, + image_strides, + gemm_strides, + filter_strides, + filter_dilations, + input_left_pads, + input_right_pads); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } +} diff --git a/client_example/20_image_to_column/image_to_column.cpp b/client_example/22_im2col_col2im/image_to_column.cpp similarity index 84% rename from client_example/20_image_to_column/image_to_column.cpp rename to client_example/22_im2col_col2im/image_to_column.cpp index ace4c1a681..a1447abf64 100644 --- a/client_example/20_image_to_column/image_to_column.cpp +++ b/client_example/22_im2col_col2im/image_to_column.cpp @@ -9,13 +9,14 @@ #include #include "ck/ck.hpp" -#include "ck/library/tensor_operation_instance/gpu/image_to_column.hpp" +#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp" +#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" using InDataType = ck::half_t; using OutDataType = ck::half_t; -using InLayout = ck::tensor_layout::convolution::GNHWC; +using ImageLayout = ck::tensor_layout::convolution::GNHWC; static constexpr ck::index_t NumDimSpatial = 2; static constexpr ck::index_t G = 1; @@ -54,8 +55,8 @@ int main() // We have NHWGC in memory space (G is dummy) // However, CK's API only accept length and stride with order of GNCHW // Hence, we need to adjust the order of stride - std::array in_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C}; - std::array out_strides{Y * X * C, 1}; + std::array image_strides{C, Hi * Wi * G * C, 1, Wi * G * C, G * C}; + std::array gemm_strides{Y * X * C, 1}; std::array filter_strides{1, 1}; std::array filter_dilations{1, 1}; @@ -65,8 +66,13 @@ int main() SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * G * C); SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * Y * X * C); - using DeviceOp = ck::tensor_operation::device:: - DeviceImageToColumn; + using namespace ck::conv_tensor_rearrange_op; + + using DeviceOp = ck::tensor_operation::device::DeviceConvTensorRearrange; // get device op instances const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< @@ -92,8 +98,8 @@ int main() in_spatial_lengths, out_spatial_lengths, wei_spatial_lengths, - in_strides, - out_strides, + image_strides, + gemm_strides, filter_strides, filter_dilations, input_left_pads, @@ -148,8 +154,8 @@ int main() in_spatial_lengths, out_spatial_lengths, wei_spatial_lengths, - in_strides, - out_strides, + image_strides, + gemm_strides, filter_strides, filter_dilations, input_left_pads, diff --git a/example/52_image_to_column/CMakeLists.txt b/example/52_im2col_col2im/CMakeLists.txt similarity index 50% rename from example/52_image_to_column/CMakeLists.txt rename to example/52_im2col_col2im/CMakeLists.txt index 226e1fc5ae..a2dec9e805 100644 --- a/example/52_image_to_column/CMakeLists.txt +++ b/example/52_im2col_col2im/CMakeLists.txt @@ -2,9 +2,11 @@ list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) set(target 0) foreach(gpu IN LISTS GPU_TARGETS) if(gpu IN_LIST gpu_list AND target EQUAL 0) - add_custom_target(example_image_to_column) + add_custom_target(example_im2col_col2im) add_example_executable(example_image_to_column_f32 image_to_column_f32.cpp) - add_dependencies(example_image_to_column example_image_to_column_f32) + add_dependencies(example_im2col_col2im example_image_to_column_f32) + add_example_executable(example_column_to_image_f32 column_to_image_f32.cpp) + add_dependencies(example_im2col_col2im example_column_to_image_f32) set(target 1) endif() endforeach() diff --git a/example/52_im2col_col2im/column_to_image_f32.cpp b/example/52_im2col_col2im/column_to_image_f32.cpp new file mode 100644 index 0000000000..52144e6885 --- /dev/null +++ b/example/52_im2col_col2im/column_to_image_f32.cpp @@ -0,0 +1,165 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +using InDataType = FP32; // ck::bhalf_t;//FP32; +using OutDataType = FP32; // ck::bhalf_t;//FP32; + +using ImLayout = ck::tensor_layout::convolution::GNHWC; +using ColumnToImageOp = ck::conv_tensor_rearrange_op::ColumnToImage; + +// clang-format off +using DeviceColToImgInstance = ck::tensor_operation::device::DeviceColumnToImageImpl + //#####################| Num| ImLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar| + //#####################| Dim| | | | Size| Block| Block| Cluster| Per| + //#####################| Spatial| | | | | | | Lengths| Vector| + //#####################| | | | | | | | | | + < NDimSpatial, ImLayout, InDataType, OutDataType, 256, 128, 128, S<16, 16>, 1>; +// clang-format on + +bool RunColumnToImage(const ExecutionConfig& config, const ck::utils::conv::ConvParam& conv_params) +{ + + const auto N = conv_params.N_; + const auto C = conv_params.C_; + + const ck::index_t NDoHoWo = + N * ck::accumulate_n( + conv_params.output_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>()); + const ck::index_t CZYX = + C * ck::accumulate_n( + conv_params.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>()); + + const auto in_desc = HostTensorDescriptor({NDoHoWo, CZYX}); + const auto out_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_params); + + std::array input_spatial_lengths{}; + std::array filter_spatial_lengths{}; + std::array output_spatial_lengths{}; + std::array image_g_n_c_wis_strides{}; + std::array gemm_m_k_strides{}; + std::array conv_filter_strides{}; + std::array conv_filter_dilations{}; + std::array input_left_pads{}; + std::array input_right_pads{}; + + auto copy = [](const auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); }; + + copy(conv_params.input_spatial_lengths_, input_spatial_lengths); + copy(conv_params.filter_spatial_lengths_, filter_spatial_lengths); + copy(conv_params.output_spatial_lengths_, output_spatial_lengths); + copy(in_desc.GetStrides(), gemm_m_k_strides); + copy(out_desc.GetStrides(), image_g_n_c_wis_strides); + copy(conv_params.conv_filter_strides_, conv_filter_strides); + copy(conv_params.conv_filter_dilations_, conv_filter_dilations); + copy(conv_params.input_left_pads_, input_left_pads); + copy(conv_params.input_right_pads_, input_right_pads); + + Tensor in(in_desc); + Tensor out_device(out_desc); + Tensor out_host(out_desc); + + std::cout << "in: " << in.mDesc << std::endl; + std::cout << "out: " << out_device.mDesc << std::endl; + + switch(config.init_method) + { + case 0: break; + case 1: in.GenerateTensorValue(GeneratorTensor_2{1, 2}); break; + default: in.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutDataType) * out_device.mDesc.GetElementSpaceSize()); + + in_device_buf.ToDevice(in.mData.data()); + + // reset input to zero + out_device_buf.SetZero(); + + static_assert(std::is_default_constructible_v); + + // do conv + auto col2img = DeviceColToImgInstance{}; + auto invoker = col2img.MakeInvoker(); + auto argument = col2img.MakeArgument(in_device_buf.GetDeviceBuffer(), + out_device_buf.GetDeviceBuffer(), + N, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + image_g_n_c_wis_strides, + gemm_m_k_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads); + + if(!col2img.IsSupportedArgument(argument)) + { + std::cerr << "wrong! device_col2img with the specified compilation parameters does " + "not support this col2img problem" + << std::endl; + + return false; + } + + float ave_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); + std::size_t num_btype = NDoHoWo * CZYX * (sizeof(OutDataType) + sizeof(InDataType)); + float gb_per_sec = num_btype / 1.E6 / ave_time; + std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl; + + if(config.do_verification) + { + auto ref_column_to_image = ck::tensor_operation::host:: + ReferenceColumnToImage(); + + auto ref_invoker = ref_column_to_image.MakeInvoker(); + + auto ref_argument = ref_column_to_image.MakeArgument(in, + out_host, + conv_params.filter_spatial_lengths_, + conv_params.conv_filter_strides_, + conv_params.conv_filter_dilations_, + conv_params.input_left_pads_, + conv_params.input_right_pads_); + + if(!ref_column_to_image.IsSupportedArgument(&ref_argument)) + { + std::cerr << "wrong! ref_col2img with the specified compilation parameters does " + "not support this col2img problem" + << std::endl; + return false; + } + + ref_invoker.Run(ref_argument); + out_device_buf.FromDevice(out_device.mData.data()); + return ck::utils::check_err(out_device.mData, out_host.mData); + } + + return true; +} + +int RunColumnToImageExample(int argc, char* argv[]) +{ + ExecutionConfig config; + ck::utils::conv::ConvParam conv_params = DefaultConvParams; + + if(!parse_cmd_args(argc, argv, config, conv_params)) + { + return EXIT_FAILURE; + } + + if(conv_params.num_dim_spatial_ != NDimSpatial) + { + std::cerr << "unsupported # of spatial dimensions" << std::endl; + return EXIT_FAILURE; + } + + return !RunColumnToImage(config, conv_params); +} + +int main(int argc, char* argv[]) { return RunColumnToImageExample(argc, argv); } diff --git a/example/52_image_to_column/common.hpp b/example/52_im2col_col2im/common.hpp similarity index 94% rename from example/52_image_to_column/common.hpp rename to example/52_im2col_col2im/common.hpp index 8510fa1e6d..61d30c4cb4 100644 --- a/example/52_image_to_column/common.hpp +++ b/example/52_im2col_col2im/common.hpp @@ -10,6 +10,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/library/utility/algorithm.hpp" @@ -20,6 +21,7 @@ #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp" template using S = ck::Sequence; @@ -32,7 +34,7 @@ struct ExecutionConfig final { bool do_verification = true; int init_method = 1; - bool time_kernel = true; + bool time_kernel = false; }; #define DefaultConvParams \ diff --git a/example/52_image_to_column/image_to_column_f32.cpp b/example/52_im2col_col2im/image_to_column_f32.cpp similarity index 89% rename from example/52_image_to_column/image_to_column_f32.cpp rename to example/52_im2col_col2im/image_to_column_f32.cpp index c8a7e5f221..6d883460cc 100644 --- a/example/52_image_to_column/image_to_column_f32.cpp +++ b/example/52_im2col_col2im/image_to_column_f32.cpp @@ -6,15 +6,16 @@ using InDataType = FP32; using OutDataType = FP32; -using InLayout = ck::tensor_layout::convolution::GNHWC; +using ImLayout = ck::tensor_layout::convolution::GNHWC; +using ImageToColumnOp = ck::conv_tensor_rearrange_op::ImageToColumn; // clang-format off using DeviceImgToColInstance = ck::tensor_operation::device::DeviceImageToColumnImpl - //#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar| + //#####################| Num| ImLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar| //#####################| Dim| | | | Size| Block| Block| Cluster| Per| //#####################| Spatial| | | | | | | Lengths| Vector| //#####################| | | | | | | | | | - < NDimSpatial, InLayout, InDataType, OutDataType, 256, 128, 128, S<16, 16>, 1>; + < NDimSpatial, ImLayout, InDataType, OutDataType, 256, 128, 128, S<16, 16>, 1>; // clang-format on bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::ConvParam& conv_params) @@ -31,14 +32,14 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv conv_params.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>()); const auto in_desc = - ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_params); + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_params); const auto out_desc = HostTensorDescriptor({NDoHoWo, CZYX}); std::array input_spatial_lengths{}; std::array filter_spatial_lengths{}; std::array output_spatial_lengths{}; - std::array input_g_n_c_wis_strides{}; - std::array output_m_k_strides{}; + std::array image_g_n_c_wis_strides{}; + std::array gemm_m_k_strides{}; std::array conv_filter_strides{}; std::array conv_filter_dilations{}; std::array input_left_pads{}; @@ -49,8 +50,8 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv copy(conv_params.input_spatial_lengths_, input_spatial_lengths); copy(conv_params.filter_spatial_lengths_, filter_spatial_lengths); copy(conv_params.output_spatial_lengths_, output_spatial_lengths); - copy(in_desc.GetStrides(), input_g_n_c_wis_strides); - copy(out_desc.GetStrides(), output_m_k_strides); + copy(in_desc.GetStrides(), image_g_n_c_wis_strides); + copy(out_desc.GetStrides(), gemm_m_k_strides); copy(conv_params.conv_filter_strides_, conv_filter_strides); copy(conv_params.conv_filter_dilations_, conv_filter_dilations); copy(conv_params.input_left_pads_, input_left_pads); @@ -90,8 +91,8 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv input_spatial_lengths, filter_spatial_lengths, output_spatial_lengths, - input_g_n_c_wis_strides, - output_m_k_strides, + image_g_n_c_wis_strides, + gemm_m_k_strides, conv_filter_strides, conv_filter_dilations, input_left_pads, @@ -114,7 +115,7 @@ bool RunImageToColumn(const ExecutionConfig& config, const ck::utils::conv::Conv if(config.do_verification) { auto ref_image_to_column = ck::tensor_operation::host:: - ReferenceImageToColumn(); + ReferenceImageToColumn(); auto ref_invoker = ref_image_to_column.MakeInvoker(); diff --git a/include/ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp b/include/ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp new file mode 100644 index 0000000000..dc08a2c88b --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +namespace ck { +namespace conv_tensor_rearrange_op { + +struct BaseConvTensorRearrangeOp +{ +}; + +struct ImageToColumn : public BaseConvTensorRearrangeOp +{ + static constexpr const char* name = "Image to Column"; +}; + +struct ColumnToImage : public BaseConvTensorRearrangeOp +{ + static constexpr const char* name = "Column to Image"; +}; + +template ::value, + bool>::type = false> +std::ostream& operator<<(std::ostream& os, const BaseConvTensorRearrangeOp&) +{ + os << Op::name; + return os; +} + +} // namespace conv_tensor_rearrange_op +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/device_image_to_column.hpp b/include/ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp similarity index 74% rename from include/ck/tensor_operation/gpu/device/device_image_to_column.hpp rename to include/ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp index 631d5189dd..898cfe0f2c 100644 --- a/include/ck/tensor_operation/gpu/device/device_image_to_column.hpp +++ b/include/ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp @@ -12,21 +12,26 @@ namespace tensor_operation { namespace device { /** - * \brief Image to column. + * \brief Convolution Tensor Rearrange. * - * This Device operator converts image ([G, N, Di, Hi, Wi, C]) to the gemm - * problem([N * Do * Ho * Wo, Z * Y * X * C]). G must be equal to 1. + * This Device operator supports conversion image ([G, N, Di, Hi, Wi, C]) to + * the gemm problem([N * Do * Ho * Wo, Z * Y * X * C]) (Image to Column) and + * conversion gemm form to the image (Column to Image). + * + * Note that G must be equal to 1. * * \tparam NDimSpatial Number of spatial dimensions. - * \tparam InputLayout Input Layout. + * \tparam ImageLayout Input Layout. * \tparam InputDataType Input Data Type. * \tparam OutputDataType Output Data Type. + * \tparam ConvTensorRearrangeOp Operation type: ImageToColumn, ColumnToImage. */ template -struct DeviceImageToColumn : public BaseOperator + typename OutputDataType, + typename ConvTensorRearrangeOp> +struct DeviceConvTensorRearrange : public BaseOperator { /** @@ -39,8 +44,8 @@ struct DeviceImageToColumn : public BaseOperator * \param input_spatial_lengths Input spatial lengths. * \param filter_spatial_lengths Filter spatial lengths. * \param output_spatial_lengths Output spatial lengths. - * \param input_g_n_c_wis_strides Input strides in order [G, N, C, D, H, W]. - * \param output_m_k_strides Output strides. + * \param image_g_n_c_wis_strides Image strides in order [G, N, C, D, H, W]. + * \param gemm_m_k_strides Gemm form strides. * \param conv_filter_strides Convolution filter strides. * \param conv_filter_dilations Convolution filter dilations. * \param input_left_pads Convolution left pads. @@ -55,8 +60,8 @@ struct DeviceImageToColumn : public BaseOperator const std::array& input_spatial_lengths, const std::array& filter_spatial_lengths, const std::array& output_spatial_lengths, - const std::array& input_g_n_c_wis_strides, - const std::array& output_m_k_strides, + const std::array& image_g_n_c_wis_strides, + const std::array& gemm_m_k_strides, const std::array& conv_filter_strides, const std::array& conv_filter_dilations, const std::array& input_left_pads, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp new file mode 100644 index 0000000000..f8b4a01681 --- /dev/null +++ b/include/ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp @@ -0,0 +1,621 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/tensor_description/tensor_descriptor.hpp" +#include "ck/tensor_description/tensor_descriptor_helper.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp" +#include "ck/host_utility/kernel_launch.hpp" +#include "ck/tensor_operation/gpu/device/convolution_backward_data_specialization.hpp" +#include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" + +#include "ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp" + +#include "ck/tensor_operation/gpu/device/matrix_padder.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp" +#include "ck/host_utility/io.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { + +// Image to column for input layout NDHWC: +// input : image converted to the gemm problem [N * Do * Ho * Wo, Z * Y * X * C] +// output : image [N, Di, Hi, Wi, C] +template = 1 && NDimSpatial <= 3, bool>::type = false> +struct DeviceColumnToImageImpl + : public DeviceConvTensorRearrange +{ + + static constexpr auto I0 = Number<0>{}; + static constexpr auto I1 = Number<1>{}; + static constexpr auto I2 = Number<2>{}; + + static constexpr auto ZIdx = Number{}; + static constexpr auto YIdx = NDimSpatial == 1 ? I0 : Number{}; + static constexpr auto XIdx = Number{}; + + static constexpr auto spatial_offset = Number<3>{}; + + static constexpr auto conv_to_gemm_transformer = + TransformConvFwdToGemm{}; + static constexpr auto matrix_padder = + MatrixPadder{ + MPerBlock, 0 /* NPerBlock*/, KPerBlock}; + + // Calculate number of independent filters for given conv params + static index_t GetNumberOfIndependentFilters(const index_t input_spatial_len, + const index_t left_pad, + const index_t right_pad, + const index_t filter_len, + const index_t filter_stride, + const index_t filter_dilation, + const index_t image_offset) + { + const index_t x_eff = (filter_len - 1) * filter_dilation + 1; + const index_t next_filter_padded = + math::integer_divide_ceil(x_eff, filter_stride) * filter_stride; + // If filter_stride >= x_eff then each filter is independent + const index_t independent_filter_stride = + filter_stride >= x_eff ? filter_stride : next_filter_padded; + const index_t w_eff = input_spatial_len - image_offset + left_pad + right_pad - x_eff; + // There are no independent filters + if(w_eff < 0) + return 0; + const index_t independent_kernels_num = w_eff / independent_filter_stride + 1; + return independent_kernels_num; + } + + // Make column form descriptor + static auto + MakeInputDescriptor_M_K(const ck::index_t N, + const ck::index_t C, + const std::array& filter_spatial_lengths, + const std::array& output_spatial_lengths, + const std::array& conv_filter_strides, + const std::array& gemm_m_k_strides, + const std::array& independent_filters, + const std::array& effs) + { + const index_t DoHoWo = ck::accumulate_n( + output_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>()); + const index_t CZYX = + C * ck::accumulate_n( + filter_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>()); + + const index_t NStride = DoHoWo * gemm_m_k_strides[I0] * gemm_m_k_strides[I1]; + // Calculate the appropriate stride for each set of independent filters + // in each dimension + const index_t WStride = + math::integer_divide_ceil(effs[XIdx], conv_filter_strides[XIdx]) * gemm_m_k_strides[I0]; + const index_t HStride = math::integer_divide_ceil(effs[YIdx], conv_filter_strides[YIdx]) * + output_spatial_lengths[XIdx] * gemm_m_k_strides[I0]; + const index_t DStride = math::integer_divide_ceil(effs[ZIdx], conv_filter_strides[ZIdx]) * + output_spatial_lengths[YIdx] * output_spatial_lengths[XIdx] * + gemm_m_k_strides[I0]; + // Create descriptor for independent filters in each dimension and + // then merge them into column form + if constexpr(NDimSpatial == 1) + { + const auto desc_gemm_form = + make_naive_tensor_descriptor(make_tuple(N, independent_filters[XIdx], CZYX), + make_tuple(NStride, WStride, gemm_m_k_strides[I1])); + const auto desc_gemm_form_merged_filters = transform_tensor_descriptor( + desc_gemm_form, + make_tuple(make_merge_transform(make_tuple(N, independent_filters[XIdx])), + make_pass_through_transform(CZYX)), + make_tuple(Sequence<0, 1>{}, Sequence<2>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + const auto desc_m_k = matrix_padder.PadADescriptor_M_K(desc_gemm_form_merged_filters); + return desc_m_k; + } + else if constexpr(NDimSpatial == 2) + { + const auto desc_gemm_form = make_naive_tensor_descriptor( + make_tuple(N, independent_filters[YIdx], independent_filters[XIdx], CZYX), + make_tuple(NStride, HStride, WStride, gemm_m_k_strides[I1])); + const auto desc_gemm_form_merged_filters = transform_tensor_descriptor( + desc_gemm_form, + make_tuple(make_merge_transform( + make_tuple(N, independent_filters[YIdx], independent_filters[XIdx])), + make_pass_through_transform(CZYX)), + make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + const auto desc_m_k = matrix_padder.PadADescriptor_M_K(desc_gemm_form_merged_filters); + return desc_m_k; + } + else if constexpr(NDimSpatial == 3) + { + const auto desc_gemm_form = make_naive_tensor_descriptor( + make_tuple(N, + independent_filters[ZIdx], + independent_filters[YIdx], + independent_filters[XIdx], + CZYX), + make_tuple(NStride, DStride, HStride, WStride, gemm_m_k_strides[I1])); + const auto desc_gemm_form_merged_filters = transform_tensor_descriptor( + desc_gemm_form, + make_tuple(make_merge_transform(make_tuple(N, + independent_filters[ZIdx], + independent_filters[YIdx], + independent_filters[XIdx])), + make_pass_through_transform(CZYX)), + make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + const auto desc_m_k = matrix_padder.PadADescriptor_M_K(desc_gemm_form_merged_filters); + return desc_m_k; + } + } + + // Use MakeADescriptor_M_K from grouped convolution forward + static auto + MakeOutDescriptor_M_K(const ck::index_t N, + const ck::index_t C, + const std::array& input_spatial_lengths, + const std::array& filter_spatial_lengths, + const std::array& image_g_n_c_wis_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads, + const std::array& image_offsets, + const std::array& independent_filters, + const std::array& effs) + { + std::array a_g_n_c_wis_lengths{1}; + std::array b_g_k_c_xs_lengths{1}; + std::array c_g_n_k_wos_lengths{1}; + + auto copy = [](const auto& x, auto& y, index_t dst_offset) { + std::copy(x.begin(), x.end(), y.begin() + dst_offset); + }; + + copy(input_spatial_lengths, a_g_n_c_wis_lengths, spatial_offset); + copy(filter_spatial_lengths, b_g_k_c_xs_lengths, spatial_offset); + // Calculate descriptor only for independent filters + copy(independent_filters, c_g_n_k_wos_lengths, spatial_offset); + + // fill only significant values (C and N) + a_g_n_c_wis_lengths[I1] = N; + a_g_n_c_wis_lengths[I2] = C; + b_g_k_c_xs_lengths[I2] = C; + c_g_n_k_wos_lengths[I1] = N; + + // Modify pads to apply offsets + std::array input_left_pads_with_offset; + for(index_t i = 0; i < NDimSpatial; i++) + { + input_left_pads_with_offset[i] = math::max(0, input_left_pads[i] - image_offsets[i]); + } + // Modify input spatial lengths to apply offsets + for(index_t i = 0; i < NDimSpatial; i++) + { + a_g_n_c_wis_lengths[i + spatial_offset] -= + math::max(0, image_offsets[i] - input_left_pads[i]); + } + + // Strides to next independent filters + std::array independent_filter_strides; + for(index_t i = 0; i < NDimSpatial; i++) + { + index_t independent_filter_stride = + math::integer_divide_ceil(effs[i], conv_filter_strides[i]) * conv_filter_strides[i]; + // If conv stride is greater than whole filter size, use conv stride + independent_filter_strides[i] = conv_filter_strides[i] >= effs[i] + ? conv_filter_strides[i] + : independent_filter_stride; + } + + // Calculate image form descriptor for the modified convolution problem + const auto in_gemmmraw_gemmkraw_desc = + conv_to_gemm_transformer.template MakeADescriptor_M_K( + a_g_n_c_wis_lengths, + image_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + {}, // not needed for A Descriptor + c_g_n_k_wos_lengths, + {}, // not needed for A Descriptor + // conv_filter_strides, + independent_filter_strides, + conv_filter_dilations, + input_left_pads_with_offset, + input_right_pads); + + const auto in_gemmm_gemmk_desc = + matrix_padder.PadADescriptor_M_K(in_gemmmraw_gemmkraw_desc); + return in_gemmm_gemmk_desc; + } + + using InputGridDesc = + remove_cvref_t; + using OutputGridDesc = remove_cvref_t; + + using Block2ETileMap = remove_cvref_t< + decltype(BlockToCTileMap_M00_N0_M01Adapt( + InputGridDesc{}))>; + + using GridwiseTensorRearrangeKernel = GridwiseTensorRearrange; + + struct Argument : public BaseArgument + { + Argument(const void* p_in, // input image + void* p_out, // output image + const ck::index_t N, + const ck::index_t C, + const std::array& input_spatial_lengths, + const std::array& filter_spatial_lengths, + const std::array& output_spatial_lengths, + const std::array& image_g_n_c_wis_strides, + const std::array& gemm_m_k_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads) + : C_(C), + X_(filter_spatial_lengths[NDimSpatial - I1]), + p_in_{static_cast(p_in)}, + p_out_{static_cast(p_out)}, + image_g_n_c_wis_strides_{image_g_n_c_wis_strides}, + conv_filter_strides_{conv_filter_strides}, + conv_filter_dilations_{conv_filter_dilations}, + input_left_pads_{input_left_pads}, + input_right_pads_{input_right_pads} + { + const index_t x_eff = + (filter_spatial_lengths[XIdx] - 1) * conv_filter_dilations[XIdx] + 1; + const index_t y_eff = + NDimSpatial < 2 + ? I1 + : (filter_spatial_lengths[YIdx] - 1) * conv_filter_dilations[YIdx] + 1; + const index_t z_eff = + NDimSpatial < 3 + ? I1 + : (filter_spatial_lengths[ZIdx] - 1) * conv_filter_dilations[ZIdx] + 1; + + // Iterate over sets of independent filters + for(int z_img_offset = 0; z_img_offset < z_eff; + z_img_offset += conv_filter_strides[ZIdx]) + { + for(int y_img_offset = 0; y_img_offset < y_eff; + y_img_offset += conv_filter_strides[YIdx]) + { + for(int x_img_offset = 0; x_img_offset < x_eff; + x_img_offset += conv_filter_strides[XIdx]) + { + + std::array image_offsets; + std::array effs; + // Calculate the starting offset for a given set of + // independent filters + if constexpr(NDimSpatial == 1) + { + image_offsets = {x_img_offset}; + effs = {x_eff}; + } + if constexpr(NDimSpatial == 2) + { + image_offsets = {y_img_offset, x_img_offset}; + effs = {y_eff, x_eff}; + } + else if constexpr(NDimSpatial == 3) + { + image_offsets = {z_img_offset, y_img_offset, x_img_offset}; + effs = {z_eff, y_eff, x_eff}; + } + + std::array independent_filters; + for(index_t i = 0; i < NDimSpatial; i++) + { + independent_filters[i] = + GetNumberOfIndependentFilters(input_spatial_lengths[i], + input_left_pads[i], + input_right_pads[i], + filter_spatial_lengths[i], + conv_filter_strides[i], + conv_filter_dilations[i], + image_offsets[i]); + } + const index_t independent_filters_acum = ck::accumulate_n( + independent_filters.begin(), NDimSpatial, 1, std::multiplies<>()); + if(independent_filters_acum <= 0) + continue; + + const auto in_grid_desc_m_k = + MakeInputDescriptor_M_K(N, + C, + filter_spatial_lengths, + output_spatial_lengths, + conv_filter_strides, + gemm_m_k_strides, + independent_filters, + effs); + const auto out_grid_desc_m_k = + MakeOutDescriptor_M_K(N, + C, + input_spatial_lengths, + filter_spatial_lengths, + image_g_n_c_wis_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + image_offsets, + independent_filters, + effs); + in_grid_desc_m_k_container_.push_back(in_grid_desc_m_k); + out_grid_desc_m_k_container_.push_back(out_grid_desc_m_k); + + const index_t x_idx = x_img_offset / conv_filter_strides[XIdx]; + const index_t y_idx = y_img_offset / conv_filter_strides[YIdx]; + const index_t z_idx = z_img_offset / conv_filter_strides[ZIdx]; + + const index_t x_offset_with_pad = + math::max(0, x_img_offset - input_left_pads[XIdx]); + const index_t y_offset_with_pad = + math::max(0, y_img_offset - input_left_pads[YIdx]); + const index_t z_offset_with_pad = + math::max(0, z_img_offset - input_left_pads[ZIdx]); + + // Memory offsets to next set of independent filters, + // move to independent filters in each dimension + const index_t in_offset = + x_idx * gemm_m_k_strides[0] + + y_idx * gemm_m_k_strides[0] * output_spatial_lengths[XIdx] + + z_idx * gemm_m_k_strides[0] * output_spatial_lengths[YIdx] * + output_spatial_lengths[XIdx]; + // Move to independent filters in appropriate dimensions + const index_t out_offset = + x_offset_with_pad * image_g_n_c_wis_strides[spatial_offset + XIdx] + + y_offset_with_pad * image_g_n_c_wis_strides[spatial_offset + YIdx] + + z_offset_with_pad * image_g_n_c_wis_strides[spatial_offset + ZIdx]; + + const InputDataType* p_in_with_offset = + static_cast(p_in) + in_offset; + OutputDataType* p_out_with_offset = + static_cast(p_out) + out_offset; + p_in_container_.push_back(p_in_with_offset); + p_out_container_.push_back(p_out_with_offset); + } + } + } + } + + void Print() const + { + for(std::size_t i = 0; i < in_grid_desc_m_k_container_.size(); i++) + { + std::cout << in_grid_desc_m_k_container_[i] << std::endl; + std::cout << out_grid_desc_m_k_container_[i] << std::endl; + } + } + + const ck::index_t C_; + const ck::index_t X_; + + const InputDataType* p_in_; + OutputDataType* p_out_; + + const std::array& image_g_n_c_wis_strides_; + const std::array& conv_filter_strides_; + const std::array& conv_filter_dilations_; + const std::array& input_left_pads_; + const std::array& input_right_pads_; + + std::vector in_grid_desc_m_k_container_; + std::vector out_grid_desc_m_k_container_; + + std::vector p_in_container_; + std::vector p_out_container_; + }; + + struct Invoker : public BaseInvoker + { + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + if(stream_config.log_level_ > 0) + { + arg.Print(); + } + + float elapsed_time = 0.f; + const auto kernel = kernel_tensor_rearrange; + + // Execute each set of independent filters + for(std::size_t i = 0; i < arg.in_grid_desc_m_k_container_.size(); i++) + { + const auto block_2_tile_map = + BlockToCTileMap_M00_N0_M01Adapt( + arg.out_grid_desc_m_k_container_[i]); + const index_t grid_size = + block_2_tile_map.CalculateGridSize(arg.in_grid_desc_m_k_container_[i]); + elapsed_time += launch_and_time_kernel(stream_config, + kernel, + dim3(grid_size), + dim3(BlockSize), + 0, + arg.in_grid_desc_m_k_container_[i], + arg.p_in_container_[i], + arg.out_grid_desc_m_k_container_[i], + arg.p_out_container_[i], + block_2_tile_map); + } + return elapsed_time; + } + + float Run(const BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + } + }; + + bool IsSupportedArgument(const Argument& arg) + { + using namespace tensor_layout::convolution; + if constexpr(!(std::is_same_v || std::is_same_v || + std::is_same_v)) + { + return false; + } + + const auto w_pad_left = arg.input_left_pads_[NDimSpatial - I1]; + const auto w_pad_right = arg.input_right_pads_[NDimSpatial - I1]; + const auto dilation_x = arg.conv_filter_dilations_[NDimSpatial - I1]; + const auto stride_x = arg.conv_filter_strides_[NDimSpatial - I1]; + bool is_w_packed = arg.image_g_n_c_wis_strides_[NDimSpatial + I2] == arg.C_; + bool is_c_packed = arg.image_g_n_c_wis_strides_[I2] == 1; + + // check vector acces with c not packed + if(!is_c_packed && ScalarPerVector != 1) + return false; + // check vector access of filter window row (only C if C is not packed) + if(!is_w_packed && arg.C_ % ScalarPerVector != 0) + return false; + // check vector access of filter window row (X * C) + if(arg.X_ * arg.C_ % ScalarPerVector != 0) + return false; + // check vector access of pads (w_pad_left/w_pad_right * C) + if(w_pad_left * arg.C_ % ScalarPerVector != 0 || + w_pad_right * arg.C_ % ScalarPerVector != 0) + return false; + // check vector access of with stride and pad + if((w_pad_left != 0 || w_pad_right != 0) && stride_x > 1 && arg.C_ % ScalarPerVector != 0) + return false; + // check vector access of with dilation + if(dilation_x > 1 && arg.C_ % ScalarPerVector != 0) + return false; + + bool valid = true; + for(std::size_t i = 0; i < arg.in_grid_desc_m_k_container_.size(); i++) + { + valid &= GridwiseTensorRearrangeKernel::CheckValidity( + arg.in_grid_desc_m_k_container_[i], arg.out_grid_desc_m_k_container_[i]); + } + return valid; + } + + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const void* p_in, // input image + void* p_out, // output image + const ck::index_t N, + const ck::index_t C, + const std::array& input_spatial_lengths, + const std::array& filter_spatial_lengths, + const std::array& output_spatial_lengths, + const std::array& image_g_n_c_wis_strides, + const std::array& gemm_m_k_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads) + { + return Argument{static_cast(p_in), + static_cast(p_out), + N, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + image_g_n_c_wis_strides, + gemm_m_k_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads}; + } + + static auto MakeInvoker() { return Invoker{}; } + + std::unique_ptr + MakeArgumentPointer(const void* p_in, // input image + void* p_out, // output image + const ck::index_t N, + const ck::index_t C, + const std::array& input_spatial_lengths, + const std::array& filter_spatial_lengths, + const std::array& output_spatial_lengths, + const std::array& image_g_n_c_wis_strides, + const std::array& gemm_m_k_strides, + const std::array& conv_filter_strides, + const std::array& conv_filter_dilations, + const std::array& input_left_pads, + const std::array& input_right_pads) override + { + return std::make_unique(static_cast(p_in), + static_cast(p_out), + N, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + image_g_n_c_wis_strides, + gemm_m_k_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads); + } + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "DeviceColumnToImage" + << "<" + << BlockSize << ", " + << MPerBlock << ", " + << KPerBlock << ", " + << ScalarPerVector + << ">"; + // clang-format on + + return str.str(); + } +}; + +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp index 19f126e66f..8b4ae7875c 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp @@ -5,64 +5,41 @@ #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" -#include "ck/tensor_operation/gpu/device/device_image_to_column.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_image_to_column.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp" +#include "ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp" #include "ck/host_utility/kernel_launch.hpp" #include "ck/tensor_operation/gpu/grid/block_to_ctile_map.hpp" #include "ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp" #include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" #include "ck/tensor_operation/gpu/device/matrix_padder.hpp" #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp" #include "ck/host_utility/io.hpp" namespace ck { namespace tensor_operation { namespace device { -template -__global__ void -#if CK_USE_LAUNCH_BOUNDS - __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) -#endif - kernel_image_to_column(const InputGridDesc in_grid_desc, - const InputDataType* __restrict__ p_in_global, - const OutputGridDesc out_grid_desc, - OutputDataType* __restrict__ p_out_global, - const Block2ETileMap block_2_tile_map) -{ -#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \ - defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || \ - defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx941__) || defined(__gfx942__)) - GridwiseImageToColumnKernel::Run( - in_grid_desc, p_in_global, out_grid_desc, p_out_global, block_2_tile_map); -#else - ignore = in_grid_desc; - ignore = p_in_global; - ignore = out_grid_desc; - ignore = p_out_global; - ignore = block_2_tile_map; -#endif -} - // Image to column for input layout NDHWC: -// input : input image [N, Di, Hi, Wi, C], -// output : output image [N * Do * Ho * Wo, Z * Y * X * C] +// input : input image [N, Di, Hi, Wi, C] +// output : gemm form [N * Do * Ho * Wo, Z * Y * X * C] template + index_t ScalarPerVector, + typename std::enable_if= 1 && NDimSpatial <= 3, bool>::type = false> struct DeviceImageToColumnImpl - : public DeviceImageToColumn + : public DeviceConvTensorRearrange { static constexpr auto I0 = Number<0>{}; @@ -83,7 +60,7 @@ struct DeviceImageToColumnImpl const std::array& input_spatial_lengths, const std::array& filter_spatial_lengths, const std::array& output_spatial_lengths, - const std::array& input_g_n_c_wis_strides, + const std::array& image_g_n_c_wis_strides, const std::array& conv_filter_strides, const std::array& conv_filter_dilations, const std::array& input_left_pads, @@ -110,9 +87,9 @@ struct DeviceImageToColumnImpl c_g_n_k_wos_lengths[I1] = N; const auto in_gemmmraw_gemmkraw_desc = - conv_to_gemm_transformer.template MakeADescriptor_M_K( + conv_to_gemm_transformer.template MakeADescriptor_M_K( a_g_n_c_wis_lengths, - input_g_n_c_wis_strides, + image_g_n_c_wis_strides, b_g_k_c_xs_lengths, {}, // not needed for A Descriptor c_g_n_k_wos_lengths, @@ -132,7 +109,7 @@ struct DeviceImageToColumnImpl const ck::index_t C, const std::array& filter_spatial_lengths, const std::array& output_spatial_lengths, - const std::array& output_m_k_strides) + const std::array& gemm_m_k_strides) { const index_t NDoHoWo = N * ck::accumulate_n( @@ -141,7 +118,7 @@ struct DeviceImageToColumnImpl C * ck::accumulate_n( filter_spatial_lengths.begin(), NDimSpatial, 1, std::multiplies<>()); const auto desc_mraw_kraw = make_naive_tensor_descriptor( - make_tuple(NDoHoWo, CZYX), make_tuple(output_m_k_strides[I0], output_m_k_strides[I1])); + make_tuple(NDoHoWo, CZYX), make_tuple(gemm_m_k_strides[I0], gemm_m_k_strides[I1])); const auto desc_m_k = matrix_padder.PadADescriptor_M_K(desc_mraw_kraw); return desc_m_k; @@ -155,28 +132,29 @@ struct DeviceImageToColumnImpl decltype(BlockToCTileMap_M00_N0_M01Adapt( OutputGridDesc{}))>; - using GridwiseImageToColumnKernel = GridwiseImageToColumn; + using GridwiseTensorRearrangeKernel = GridwiseTensorRearrange; struct Argument : public BaseArgument { Argument(const void* p_in, // input image - void* p_out, // output image + void* p_out, // gemm form const ck::index_t N, const ck::index_t C, const std::array& input_spatial_lengths, const std::array& filter_spatial_lengths, const std::array& output_spatial_lengths, - const std::array& input_g_n_c_wis_strides, - const std::array& output_m_k_strides, + const std::array& image_g_n_c_wis_strides, + const std::array& gemm_m_k_strides, const std::array& conv_filter_strides, const std::array& conv_filter_dilations, const std::array& input_left_pads, @@ -185,7 +163,7 @@ struct DeviceImageToColumnImpl X_(filter_spatial_lengths[NDimSpatial - I1]), p_in_{static_cast(p_in)}, p_out_{static_cast(p_out)}, - input_g_n_c_wis_strides_{input_g_n_c_wis_strides}, + image_g_n_c_wis_strides_{image_g_n_c_wis_strides}, conv_filter_strides_{conv_filter_strides}, conv_filter_dilations_{conv_filter_dilations}, input_left_pads_{input_left_pads}, @@ -197,7 +175,7 @@ struct DeviceImageToColumnImpl input_spatial_lengths, filter_spatial_lengths, output_spatial_lengths, - input_g_n_c_wis_strides, + image_g_n_c_wis_strides, conv_filter_strides, conv_filter_dilations, @@ -205,7 +183,7 @@ struct DeviceImageToColumnImpl input_right_pads); out_grid_desc_m_k_ = MakeOutDescriptor_M_K( - N, C, filter_spatial_lengths, output_spatial_lengths, output_m_k_strides); + N, C, filter_spatial_lengths, output_spatial_lengths, gemm_m_k_strides); } void Print() const @@ -220,7 +198,7 @@ struct DeviceImageToColumnImpl const InputDataType* p_in_; OutputDataType* p_out_; - const std::array& input_g_n_c_wis_strides_; + const std::array& image_g_n_c_wis_strides_; const std::array& conv_filter_strides_; const std::array& conv_filter_dilations_; const std::array& input_left_pads_; @@ -243,12 +221,12 @@ struct DeviceImageToColumnImpl BlockToCTileMap_M00_N0_M01Adapt( arg.out_grid_desc_m_k_); const index_t grid_size = block_2_tile_map.CalculateGridSize(arg.out_grid_desc_m_k_); - const auto kernel = kernel_image_to_column; + const auto kernel = kernel_tensor_rearrange; float elapsed_time = launch_and_time_kernel(stream_config, kernel, @@ -273,12 +251,8 @@ struct DeviceImageToColumnImpl bool IsSupportedArgument(const Argument& arg) { using namespace tensor_layout::convolution; - if(!(std::is_same_v || std::is_same_v || - std::is_same_v)) - { - return false; - } - if(!(NDimSpatial >= 1 && NDimSpatial <= 3)) + if constexpr(!(std::is_same_v || std::is_same_v || + std::is_same_v)) { return false; } @@ -287,8 +261,8 @@ struct DeviceImageToColumnImpl const auto w_pad_right = arg.input_right_pads_[NDimSpatial - I1]; const auto dilation_x = arg.conv_filter_dilations_[NDimSpatial - I1]; const auto stride_x = arg.conv_filter_strides_[NDimSpatial - I1]; - bool is_w_packed = arg.input_g_n_c_wis_strides_[NDimSpatial + I2] == arg.C_; - bool is_c_packed = arg.input_g_n_c_wis_strides_[I2] == 1; + bool is_w_packed = arg.image_g_n_c_wis_strides_[NDimSpatial + I2] == arg.C_; + bool is_c_packed = arg.image_g_n_c_wis_strides_[I2] == 1; // check vector acces with c not packed if(!is_c_packed && ScalarPerVector != 1) @@ -310,8 +284,8 @@ struct DeviceImageToColumnImpl if(dilation_x > 1 && arg.C_ % ScalarPerVector != 0) return false; - return GridwiseImageToColumnKernel::CheckValidity(arg.in_grid_desc_m_k_, - arg.out_grid_desc_m_k_); + return GridwiseTensorRearrangeKernel::CheckValidity(arg.in_grid_desc_m_k_, + arg.out_grid_desc_m_k_); } bool IsSupportedArgument(const BaseArgument* p_arg) override @@ -320,14 +294,14 @@ struct DeviceImageToColumnImpl } static auto MakeArgument(const void* p_in, // input image - void* p_out, // output image + void* p_out, // gemm form const ck::index_t N, const ck::index_t C, const std::array& input_spatial_lengths, const std::array& filter_spatial_lengths, const std::array& output_spatial_lengths, - const std::array& input_g_n_c_wis_strides, - const std::array& output_m_k_strides, + const std::array& image_g_n_c_wis_strides, + const std::array& gemm_m_k_strides, const std::array& conv_filter_strides, const std::array& conv_filter_dilations, const std::array& input_left_pads, @@ -340,8 +314,8 @@ struct DeviceImageToColumnImpl input_spatial_lengths, filter_spatial_lengths, output_spatial_lengths, - input_g_n_c_wis_strides, - output_m_k_strides, + image_g_n_c_wis_strides, + gemm_m_k_strides, conv_filter_strides, conv_filter_dilations, input_left_pads, @@ -352,14 +326,14 @@ struct DeviceImageToColumnImpl std::unique_ptr MakeArgumentPointer(const void* p_in, // input image - void* p_out, // output image + void* p_out, // gemm form const ck::index_t N, const ck::index_t C, const std::array& input_spatial_lengths, const std::array& filter_spatial_lengths, const std::array& output_spatial_lengths, - const std::array& input_g_n_c_wis_strides, - const std::array& output_m_k_strides, + const std::array& image_g_n_c_wis_strides, + const std::array& gemm_m_k_strides, const std::array& conv_filter_strides, const std::array& conv_filter_dilations, const std::array& input_left_pads, @@ -372,8 +346,8 @@ struct DeviceImageToColumnImpl input_spatial_lengths, filter_spatial_lengths, output_spatial_lengths, - input_g_n_c_wis_strides, - output_m_k_strides, + image_g_n_c_wis_strides, + gemm_m_k_strides, conv_filter_strides, conv_filter_dilations, input_left_pads, diff --git a/include/ck/tensor_operation/gpu/grid/gridwise_image_to_column.hpp b/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp similarity index 52% rename from include/ck/tensor_operation/gpu/grid/gridwise_image_to_column.hpp rename to include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp index 93625a324e..e4e47b4fae 100644 --- a/include/ck/tensor_operation/gpu/grid/gridwise_image_to_column.hpp +++ b/include/ck/tensor_operation/gpu/grid/gridwise_tensor_rearrange.hpp @@ -16,6 +16,36 @@ namespace ck { +template +__global__ void +#if CK_USE_LAUNCH_BOUNDS + __launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU) +#endif + kernel_tensor_rearrange(const InputGridDesc in_grid_desc, + const InputDataType* __restrict__ p_in_global, + const OutputGridDesc out_grid_desc, + OutputDataType* __restrict__ p_out_global, + const Block2ETileMap block_2_tile_map) +{ +#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx906__) || defined(__gfx908__) || \ + defined(__gfx90a__) || defined(__gfx940__) || defined(__gfx1030__) || defined(__gfx1100__) || \ + defined(__gfx1101__) || defined(__gfx1102__) || defined(__gfx941__) || defined(__gfx942__)) + GridwiseTensorRearrangeKernel::Run( + in_grid_desc, p_in_global, out_grid_desc, p_out_global, block_2_tile_map); +#else + ignore = in_grid_desc; + ignore = p_in_global; + ignore = out_grid_desc; + ignore = p_out_global; + ignore = block_2_tile_map; +#endif +} + template -struct GridwiseImageToColumn +struct GridwiseTensorRearrange { static constexpr auto I0 = Number<0>{}; @@ -55,27 +86,27 @@ struct GridwiseImageToColumn auto out_global_buf = make_dynamic_buffer( p_out_global, out_grid_desc.GetElementSpaceSize()); - auto copy_global_to_global = ThreadGroupTensorSliceTransfer_v7< - ThisThreadBlock, - Tuple, - Tuple, - decltype(tie(in_grid_desc)), - decltype(tie(out_grid_desc)), - tensor_operation::element_wise::PassThrough, - Sequence(InMemoryDataOperationEnum::Set)>, - Sequence, - ThreadClusterLengths, - Sequence<0, 1>, - Sequence<0, 1>, - I1, - ScalarPerVector, - Sequence, - Sequence>{ - in_grid_desc, - make_tuple(make_multi_index(m_block_data_idx_on_grid, k_block_data_idx_on_grid)), - out_grid_desc, - make_tuple(make_multi_index(m_block_data_idx_on_grid, k_block_data_idx_on_grid)), - tensor_operation::element_wise::PassThrough{}}; + auto copy_global_to_global = + ThreadGroupTensorSliceTransfer_v7, + Tuple, + decltype(tie(in_grid_desc)), + decltype(tie(out_grid_desc)), + tensor_operation::element_wise::PassThrough, + Sequence(DstInMemOp)>, + Sequence, + ThreadClusterLengths, + Sequence<0, 1>, + Sequence<0, 1>, + I1, + ScalarPerVector, + Sequence, + Sequence>{ + in_grid_desc, + make_tuple(make_multi_index(m_block_data_idx_on_grid, k_block_data_idx_on_grid)), + out_grid_desc, + make_tuple(make_multi_index(m_block_data_idx_on_grid, k_block_data_idx_on_grid)), + tensor_operation::element_wise::PassThrough{}}; copy_global_to_global.Run( tie(in_grid_desc), tie(in_global_buf), tie(out_grid_desc), tie(out_global_buf)); diff --git a/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp b/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp index cee3d2825b..6f546f1d6d 100644 --- a/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp +++ b/include/ck/tensor_operation/operator_transform/transform_conv_fwd_to_gemm.hpp @@ -20,348 +20,13 @@ struct TransformConvFwdToGemm static constexpr auto I0 = Number<0>{}; static constexpr auto I1 = Number<1>{}; - template , - bool>::type = false> - static auto - MakeADescriptor_M_K(const std::array& a_g_n_c_wis_lengths, - const std::array& /* a_g_n_c_wis_strides */, - const std::array& b_g_k_c_xs_lengths, - const std::array& /* b_g_k_c_xs_strides */, - const std::array& c_g_n_k_wos_lengths, - const std::array& /* c_g_n_k_wos_strides */, - const std::array& conv_filter_strides, - const std::array& conv_filter_dilations, - const std::array& input_left_pads, - const std::array& input_right_pads) - { - const index_t N = a_g_n_c_wis_lengths[1]; - const index_t C = a_g_n_c_wis_lengths[2]; - - const index_t Wi = a_g_n_c_wis_lengths[3]; - - const index_t Wo = c_g_n_k_wos_lengths[3]; - - const index_t ConvStrideW = conv_filter_strides[0]; - - if constexpr(ConvForwardSpecialization == - device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) - { - const index_t NWo = - N * ck::accumulate_n( - c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>()); - - const auto in_gemmm_gemmk_desc = - make_naive_tensor_descriptor_packed(make_tuple(NWo, C)); - - return in_gemmm_gemmk_desc; - } - else if constexpr(ConvForwardSpecialization == - device::ConvolutionForwardSpecialization::Filter1x1Pad0) - { - const auto in_n_wi_c_desc = make_naive_tensor_descriptor_packed(make_tuple(N, Wi, C)); - - const auto in_n_wo_c_desc = transform_tensor_descriptor( - in_n_wi_c_desc, - make_tuple(make_pass_through_transform(N), - make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)), - make_pass_through_transform(C)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); - - const auto in_gemmm_gemmk_desc = transform_tensor_descriptor( - in_n_wo_c_desc, - make_tuple(make_merge_transform(make_tuple(N, Wo)), make_pass_through_transform(C)), - make_tuple(Sequence<0, 1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - return in_gemmm_gemmk_desc; - } - else - { - const index_t X = b_g_k_c_xs_lengths[3]; - const index_t ConvDilationW = conv_filter_dilations[0]; - const index_t InLeftPadW = input_left_pads[0]; - const index_t InRightPadW = input_right_pads[0]; - - const auto in_n_wi_c_desc = make_naive_tensor_descriptor_packed(make_tuple(N, Wi, C)); - - const auto in_n_wip_c_desc = transform_tensor_descriptor( - in_n_wi_c_desc, - make_tuple(make_pass_through_transform(N), - make_pad_transform(Wi, InLeftPadW, InRightPadW), - make_pass_through_transform(C)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{})); - - const auto in_n_x_wo_c_desc = transform_tensor_descriptor( - in_n_wip_c_desc, - make_tuple( - make_pass_through_transform(N), - make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), - make_pass_through_transform(C)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3>{})); - - const auto in_gemmm_gemmk_desc = - transform_tensor_descriptor(in_n_x_wo_c_desc, - make_tuple(make_merge_transform(make_tuple(N, Wo)), - make_merge_transform(make_tuple(X, C))), - make_tuple(Sequence<0, 2>{}, Sequence<1, 3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - return in_gemmm_gemmk_desc; - } - } - - template , - bool>::type = false> - static auto - MakeADescriptor_M_K(const std::array& a_g_n_c_wis_lengths, - const std::array& /* a_g_n_c_wis_strides */, - const std::array& b_g_k_c_xs_lengths, - const std::array& /* b_g_k_c_xs_strides */, - const std::array& c_g_n_k_wos_lengths, - const std::array& /* c_g_n_k_wos_strides */, - const std::array& conv_filter_strides, - const std::array& conv_filter_dilations, - const std::array& input_left_pads, - const std::array& input_right_pads) - { - const index_t N = a_g_n_c_wis_lengths[1]; - const index_t C = a_g_n_c_wis_lengths[2]; - - const index_t Hi = a_g_n_c_wis_lengths[3]; - const index_t Wi = a_g_n_c_wis_lengths[4]; - - const index_t Ho = c_g_n_k_wos_lengths[3]; - const index_t Wo = c_g_n_k_wos_lengths[4]; - - const index_t ConvStrideH = conv_filter_strides[0]; - const index_t ConvStrideW = conv_filter_strides[1]; - - if constexpr(ConvForwardSpecialization == - device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) - { - const index_t NHoWo = - N * ck::accumulate_n( - c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>()); - - const auto in_gemmm_gemmk_desc = - make_naive_tensor_descriptor_packed(make_tuple(NHoWo, C)); - - return in_gemmm_gemmk_desc; - } - else if constexpr(ConvForwardSpecialization == - device::ConvolutionForwardSpecialization::Filter1x1Pad0) - { - const auto in_n_hi_wi_c_desc = - make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C)); - - const auto in_n_ho_wo_c_desc = transform_tensor_descriptor( - in_n_hi_wi_c_desc, - make_tuple(make_pass_through_transform(N), - make_embed_transform(make_tuple(Ho), make_tuple(ConvStrideH)), - make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)), - make_pass_through_transform(C)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - - const auto in_gemmm_gemmk_desc = - transform_tensor_descriptor(in_n_ho_wo_c_desc, - make_tuple(make_merge_transform(make_tuple(N, Ho, Wo)), - make_pass_through_transform(C)), - make_tuple(Sequence<0, 1, 2>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - return in_gemmm_gemmk_desc; - } - else - { - const index_t Y = b_g_k_c_xs_lengths[3]; - const index_t X = b_g_k_c_xs_lengths[4]; - - const index_t ConvDilationH = conv_filter_dilations[0]; - const index_t ConvDilationW = conv_filter_dilations[1]; - - const index_t InLeftPadH = input_left_pads[0]; - const index_t InLeftPadW = input_left_pads[1]; - - const index_t InRightPadH = input_right_pads[0]; - const index_t InRightPadW = input_right_pads[1]; - - const auto in_n_hi_wi_c_desc = - make_naive_tensor_descriptor_packed(make_tuple(N, Hi, Wi, C)); - - const auto in_n_hip_wip_c_desc = transform_tensor_descriptor( - in_n_hi_wi_c_desc, - make_tuple(make_pass_through_transform(N), - make_pad_transform(Hi, InLeftPadH, InRightPadH), - make_pad_transform(Wi, InLeftPadW, InRightPadW), - make_pass_through_transform(C)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{})); - - const auto in_n_y_ho_x_wo_c_desc = transform_tensor_descriptor( - in_n_hip_wip_c_desc, - make_tuple( - make_pass_through_transform(N), - make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), - make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), - make_pass_through_transform(C)), - make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}), - make_tuple(Sequence<0>{}, Sequence<1, 2>{}, Sequence<3, 4>{}, Sequence<5>{})); - - const auto in_gemmm_gemmk_desc = - transform_tensor_descriptor(in_n_y_ho_x_wo_c_desc, - make_tuple(make_merge_transform(make_tuple(N, Ho, Wo)), - make_merge_transform(make_tuple(Y, X, C))), - make_tuple(Sequence<0, 2, 4>{}, Sequence<1, 3, 5>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - return in_gemmm_gemmk_desc; - } - } - - template , - bool>::type = false> - static auto - MakeADescriptor_M_K(const std::array& a_g_n_c_wis_lengths, - const std::array& /* a_g_n_c_wis_strides */, - const std::array& b_g_k_c_xs_lengths, - const std::array& /* b_g_k_c_xs_strides */, - const std::array& c_g_n_k_wos_lengths, - const std::array& /* c_g_n_k_wos_strides */, - const std::array& conv_filter_strides, - const std::array& conv_filter_dilations, - const std::array& input_left_pads, - const std::array& input_right_pads) - { - const index_t N = a_g_n_c_wis_lengths[1]; - const index_t C = a_g_n_c_wis_lengths[2]; - - const index_t Di = a_g_n_c_wis_lengths[3]; - const index_t Hi = a_g_n_c_wis_lengths[4]; - const index_t Wi = a_g_n_c_wis_lengths[5]; - - const index_t Do = c_g_n_k_wos_lengths[3]; - const index_t Ho = c_g_n_k_wos_lengths[4]; - const index_t Wo = c_g_n_k_wos_lengths[5]; - - const index_t ConvStrideD = conv_filter_strides[0]; - const index_t ConvStrideH = conv_filter_strides[1]; - const index_t ConvStrideW = conv_filter_strides[2]; - - if constexpr(ConvForwardSpecialization == - device::ConvolutionForwardSpecialization::Filter1x1Stride1Pad0) - { - const index_t NDoHoWo = - N * ck::accumulate_n( - c_g_n_k_wos_lengths.begin() + 3, NDimSpatial, 1, std::multiplies<>()); - - const auto in_gemmm_gemmk_desc = - make_naive_tensor_descriptor_packed(make_tuple(NDoHoWo, C)); - - return in_gemmm_gemmk_desc; - } - else if constexpr(ConvForwardSpecialization == - device::ConvolutionForwardSpecialization::Filter1x1Pad0) - { - const auto in_n_di_hi_wi_c_desc = - make_naive_tensor_descriptor_packed(make_tuple(N, Di, Hi, Wi, C)); - - const auto in_n_do_ho_wo_c_desc = transform_tensor_descriptor( - in_n_di_hi_wi_c_desc, - make_tuple(make_pass_through_transform(N), - make_embed_transform(make_tuple(Do), make_tuple(ConvStrideD)), - make_embed_transform(make_tuple(Ho), make_tuple(ConvStrideH)), - make_embed_transform(make_tuple(Wo), make_tuple(ConvStrideW)), - make_pass_through_transform(C)), - make_tuple( - Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), - make_tuple( - Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); - - const auto in_gemmm_gemmk_desc = transform_tensor_descriptor( - in_n_do_ho_wo_c_desc, - make_tuple(make_merge_transform(make_tuple(N, Do, Ho, Wo)), - make_pass_through_transform(C)), - make_tuple(Sequence<0, 1, 2, 3>{}, Sequence<4>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - return in_gemmm_gemmk_desc; - } - else - { - const index_t Z = b_g_k_c_xs_lengths[3]; - const index_t Y = b_g_k_c_xs_lengths[4]; - const index_t X = b_g_k_c_xs_lengths[5]; - - const index_t ConvDilationD = conv_filter_dilations[0]; - const index_t ConvDilationH = conv_filter_dilations[1]; - const index_t ConvDilationW = conv_filter_dilations[2]; - - const index_t InLeftPadD = input_left_pads[0]; - const index_t InLeftPadH = input_left_pads[1]; - const index_t InLeftPadW = input_left_pads[2]; - - const index_t InRightPadD = input_right_pads[0]; - const index_t InRightPadH = input_right_pads[1]; - const index_t InRightPadW = input_right_pads[2]; - - const auto in_n_di_hi_wi_c_desc = - make_naive_tensor_descriptor_packed(make_tuple(N, Di, Hi, Wi, C)); - - const auto in_n_hip_wip_c_desc = transform_tensor_descriptor( - in_n_di_hi_wi_c_desc, - make_tuple(make_pass_through_transform(N), - make_pad_transform(Di, InLeftPadD, InRightPadD), - make_pad_transform(Hi, InLeftPadH, InRightPadH), - make_pad_transform(Wi, InLeftPadW, InRightPadW), - make_pass_through_transform(C)), - make_tuple( - Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), - make_tuple( - Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{})); - - const auto in_n_z_do_y_ho_x_wo_c_desc = transform_tensor_descriptor( - in_n_hip_wip_c_desc, - make_tuple( - make_pass_through_transform(N), - make_embed_transform(make_tuple(Z, Do), make_tuple(ConvDilationD, ConvStrideD)), - make_embed_transform(make_tuple(Y, Ho), make_tuple(ConvDilationH, ConvStrideH)), - make_embed_transform(make_tuple(X, Wo), make_tuple(ConvDilationW, ConvStrideW)), - make_pass_through_transform(C)), - make_tuple( - Sequence<0>{}, Sequence<1>{}, Sequence<2>{}, Sequence<3>{}, Sequence<4>{}), - make_tuple(Sequence<0>{}, - Sequence<1, 2>{}, - Sequence<3, 4>{}, - Sequence<5, 6>{}, - Sequence<7>{})); - - const auto in_gemmm_gemmk_desc = transform_tensor_descriptor( - in_n_z_do_y_ho_x_wo_c_desc, - make_tuple(make_merge_transform(make_tuple(N, Do, Ho, Wo)), - make_merge_transform(make_tuple(Z, Y, X, C))), - make_tuple(Sequence<0, 2, 4, 6>{}, Sequence<1, 3, 5, 7>{}), - make_tuple(Sequence<0>{}, Sequence<1>{})); - - return in_gemmm_gemmk_desc; - } - } - // TODO: implement ck::tensor_layout::convolution that describe packed/strided dimemsion as // properties template || - is_same_v), + is_same_v || + is_same_v), bool>::type = false> static auto MakeADescriptor_M_K(const std::array& a_g_n_c_wis_lengths, @@ -473,7 +138,8 @@ struct TransformConvFwdToGemm template || - is_same_v), + is_same_v || + is_same_v), bool>::type = false> static auto MakeADescriptor_M_K(const std::array& a_g_n_c_wis_lengths, @@ -601,7 +267,8 @@ struct TransformConvFwdToGemm template || - is_same_v), + is_same_v || + is_same_v), bool>::type = false> static auto MakeADescriptor_M_K(const std::array& a_g_n_c_wis_lengths, diff --git a/include/ck/utility/dynamic_buffer.hpp b/include/ck/utility/dynamic_buffer.hpp index 02d61f34ed..85756cd0d1 100644 --- a/include/ck/utility/dynamic_buffer.hpp +++ b/include/ck/utility/dynamic_buffer.hpp @@ -140,10 +140,36 @@ struct DynamicBuffer } else if constexpr(Op == InMemoryDataOperationEnum::Add) { - auto tmp = this->template Get(i, is_valid_element); - this->template Set(i, is_valid_element, x + tmp); - // tmp += x; - // this->template Set(i, is_valid_element, tmp); + auto tmp = this->template Get(i, is_valid_element); + using scalar_t = typename scalar_type>::type; + // handle bfloat addition + if constexpr(is_same_v) + { + if constexpr(is_scalar_type::value) + { + // Scalar type + auto result = + type_convert(type_convert(x) + type_convert(tmp)); + this->template Set(i, is_valid_element, result); + } + else + { + // Vector type + constexpr auto vector_size = scalar_type>::vector_size; + const vector_type a_vector{tmp}; + const vector_type b_vector{x}; + static_for<0, vector_size, 1>{}([&](auto idx) { + auto result = type_convert( + type_convert(a_vector.template AsType()[idx]) + + type_convert(b_vector.template AsType()[idx])); + this->template Set(i + idx, is_valid_element, result); + }); + } + } + else + { + this->template Set(i, is_valid_element, x + tmp); + } } } diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp new file mode 100644 index 0000000000..7ccfc6eb77 --- /dev/null +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp @@ -0,0 +1,363 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include + +#include "ck/tensor_operation/gpu/device/device_base.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/library/utility/host_tensor.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +/** + * \brief Reference implementation for column to image. + * + * Input tensor descriptor has [N * Do * Ho * Wo, Z * Y * X * C] data layout. + * Memory layout is the same. + * Output tensor descriptor has [G, N, C, Di, Hi, Wi] data layout. + * G must be equal to 1. Memory layout is [G, N, Di, Hi, Wi, C]. + * + * \tparam NDimSpatial Number of spatial dimensions. + * \tparam ImageLayout Image Layout. + * \tparam InDataType Input Data Type. + * \tparam OutDataType Output Data Type. + */ +template = 1 && NDimSpatial <= 3, bool>::type = false> +struct ReferenceColumnToImage : public device::BaseOperator +{ + // Argument + struct Argument : public device::BaseArgument + { + public: + Argument(const Tensor& input, + Tensor& output, + std::vector filter_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads) + : input_{input}, + output_{output}, + conv_strides_{conv_filter_strides}, + conv_dilations_{conv_filter_dilations}, + in_left_pads_{input_left_pads}, + in_right_pads_{input_right_pads}, + filter_spatial_lengths_{filter_spatial_lengths} + { + initOutputSpatialLengths(); + } + + const Tensor& input_; + Tensor& output_; + + std::vector conv_strides_; + std::vector conv_dilations_; + std::vector in_left_pads_; + std::vector in_right_pads_; + + std::vector filter_spatial_lengths_; + std::vector output_spatial_lengths_; + + private: + void initOutputSpatialLengths() + { + constexpr auto input_offset_to_spatial = 3; + + for(ck::index_t i = 0; i < NDimSpatial; ++i) + { + // XEff = (X - 1) * conv_dilation_w + 1; + // Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; + const ck::index_t x_eff = (filter_spatial_lengths_[i] - 1) * conv_dilations_[i] + 1; + + output_spatial_lengths_.push_back( + (output_.GetLengths()[i + input_offset_to_spatial] + in_left_pads_[i] + + in_right_pads_[i] - x_eff) / + conv_strides_[i] + + 1); + } + } + }; + + struct Invoker : public device::BaseInvoker + { + using Argument = ReferenceColumnToImage::Argument; + + float Run(const Argument& arg) + { + if(!(arg.output_.GetNumOfDimension() == NDimSpatial + 3 && + arg.input_.GetNumOfDimension() == 2)) + { + throw std::runtime_error("wrong! inconsistent dimension"); + } + + const index_t N = arg.output_.GetLengths()[1]; + const index_t C = arg.output_.GetLengths()[2]; + + if constexpr(NDimSpatial == 1) + { + const index_t Wo = arg.output_spatial_lengths_[0]; + auto func = [&](auto n) { + for(index_t wo = 0; wo < Wo; ++wo) + { + index_t row = n * Wo + wo; + index_t column = 0; + + for(index_t x = 0; x < arg.filter_spatial_lengths_[0]; ++x) + { + auto wi = static_cast(wo * arg.conv_strides_[0]) + + static_cast(x * arg.conv_dilations_[0]) - + static_cast(arg.in_left_pads_[0]); + + for(index_t c = 0; c < C; ++c) + { + if(wi >= 0 && + ck::type_convert(wi) < arg.output_.GetLengths()[3]) + { + float v_in = ck::type_convert(arg.input_(row, column)); + float v_out = ck::type_convert(arg.output_(0, n, c, wi)); + arg.output_(0, n, c, wi) = + ck::type_convert(v_in + v_out); + } + column++; + } + } + } + }; + + make_ParallelTensorFunctor(func, N)(std::thread::hardware_concurrency()); + + return 0; + } + else if constexpr(NDimSpatial == 2) + { + const index_t Ho = arg.output_spatial_lengths_[0]; + const index_t Wo = arg.output_spatial_lengths_[1]; + + auto func = [&](auto n) { + for(index_t ho = 0; ho < Ho; ++ho) + { + for(index_t wo = 0; wo < Wo; ++wo) + { + index_t row = n * Ho * Wo + ho * Wo + wo; + index_t column = 0; + + for(index_t y = 0; y < arg.filter_spatial_lengths_[0]; ++y) + { + auto hi = + static_cast(ho * arg.conv_strides_[0]) + + static_cast(y * arg.conv_dilations_[0]) - + static_cast(arg.in_left_pads_[0]); + + for(index_t x = 0; x < arg.filter_spatial_lengths_[1]; ++x) + { + auto wi = + static_cast(wo * arg.conv_strides_[1]) + + static_cast(x * arg.conv_dilations_[1]) - + static_cast(arg.in_left_pads_[1]); + + for(index_t c = 0; c < C; ++c) + { + + if(hi >= 0 && + ck::type_convert(hi) < + arg.output_.GetLengths()[3] && + wi >= 0 && + ck::type_convert(wi) < + arg.output_.GetLengths()[4]) + { + float v_in = + ck::type_convert(arg.input_(row, column)); + float v_out = ck::type_convert( + arg.output_(0, n, c, hi, wi)); + arg.output_(0, n, c, hi, wi) = + ck::type_convert(v_in + v_out); + } + column++; + } + } + } + } + } + }; + + make_ParallelTensorFunctor(func, N)(std::thread::hardware_concurrency()); + + return 0; + } + else if constexpr(NDimSpatial == 3) + { + const index_t Do = arg.output_spatial_lengths_[0]; + const index_t Ho = arg.output_spatial_lengths_[1]; + const index_t Wo = arg.output_spatial_lengths_[2]; + + auto func = [&](auto n) { + for(index_t d_o = 0; d_o < Do; ++d_o) + { + for(index_t ho = 0; ho < Ho; ++ho) + { + for(index_t wo = 0; wo < Wo; ++wo) + { + index_t row = n * Do * Ho * Wo + d_o * Ho * Wo + ho * Wo + wo; + index_t column = 0; + + for(index_t z = 0; z < arg.filter_spatial_lengths_[0]; ++z) + { + auto di = + static_cast(d_o * arg.conv_strides_[0]) + + static_cast(z * arg.conv_dilations_[0]) - + static_cast(arg.in_left_pads_[0]); + for(index_t y = 0; y < arg.filter_spatial_lengths_[1]; ++y) + { + auto hi = + static_cast(ho * + arg.conv_strides_[1]) + + static_cast(y * + arg.conv_dilations_[1]) - + static_cast(arg.in_left_pads_[1]); + for(index_t x = 0; x < arg.filter_spatial_lengths_[2]; ++x) + { + auto wi = + static_cast( + wo * arg.conv_strides_[2]) + + static_cast( + x * arg.conv_dilations_[2]) - + static_cast(arg.in_left_pads_[2]); + for(index_t c = 0; c < C; ++c) + { + if(di >= 0 && + ck::type_convert(di) < + arg.output_.GetLengths()[3] && + hi >= 0 && + ck::type_convert(hi) < + arg.output_.GetLengths()[4] && + wi >= 0 && + ck::type_convert(wi) < + arg.output_.GetLengths()[5]) + { + float v_in = ck::type_convert( + arg.input_(row, column)); + float v_out = ck::type_convert( + arg.output_(0, n, c, di, hi, wi)); + arg.output_(0, n, c, di, hi, wi) = + ck::type_convert(v_in + v_out); + } + column++; + } + } + } + } + } + } + } + }; + + make_ParallelTensorFunctor(func, N)(std::thread::hardware_concurrency()); + + return 0; + } + } + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& /*stream_config*/ = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg)); + } + }; + + static constexpr bool IsValidCompilationParameter() + { + using namespace tensor_layout::convolution; + + if constexpr(!(std::is_same_v || std::is_same_v || + std::is_same_v)) + { + return false; + } + if constexpr(!(NDimSpatial >= 1 && NDimSpatial <= 3)) + { + return false; + } + return true; + } + + bool IsSupportedArgument(const Argument& arg) + { + const ck::index_t G = arg.output_.GetLengths()[0]; + const ck::index_t N = arg.output_.GetLengths()[1]; + const ck::index_t C = arg.output_.GetLengths()[2]; + + const index_t NDoHoWo = + N * ck::accumulate_n( + arg.output_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>()); + const index_t CZYX = + C * ck::accumulate_n( + arg.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>()); + + if(!(arg.input_.GetLengths()[0] == static_cast(NDoHoWo) && + arg.input_.GetLengths()[1] == static_cast(CZYX))) + { + return false; + } + + if(G != 1) + { + return false; + } + return true; + } + + bool IsSupportedArgument(const device::BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const Tensor& input, + Tensor& output, + std::vector filter_spatial_lengths, + std::vector conv_filter_strides, + std::vector conv_filter_dilations, + std::vector input_left_pads, + std::vector input_right_pads) + { + return Argument{input, + output, + filter_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads}; + } + + static auto MakeInvoker() { return Invoker{}; } + + virtual std::unique_ptr MakeInvokerPointer() + { + return std::make_unique(Invoker{}); + } + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "ReferenceColumnToImage" + << std::endl; + // clang-format on + + return str.str(); + } +}; + +} // namespace host +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp index 3f50ab88b3..9e12d07844 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp @@ -18,16 +18,18 @@ namespace host { /** * \brief Reference implementation for image to column. * - * Tensor descriptor has [G, N, C, Di, Hi, Wi] data layout. + * Input tensor descriptor has [G, N, C, Di, Hi, Wi] data layout. * G must be equal to 1. Memory layout is [G, N, Di, Hi, Wi, C]. + * Output tensor descriptor has [N * Do * Ho * Wo, Z * Y * X * C] data layout. + * Memory layout is the same. * * \tparam NDimSpatial Number of spatial dimensions. - * \tparam InputLayout Input Layout. + * \tparam ImageLayout Image Layout. * \tparam InDataType Input Data Type. * \tparam OutDataType Output Data Type. */ template = 1 && NDimSpatial <= 3, bool>::type = false> @@ -240,8 +242,8 @@ struct ReferenceImageToColumn : public device::BaseOperator { using namespace tensor_layout::convolution; - if constexpr(!(std::is_same_v || std::is_same_v || - std::is_same_v)) + if constexpr(!(std::is_same_v || std::is_same_v || + std::is_same_v)) { return false; } diff --git a/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp b/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp new file mode 100644 index 0000000000..57c0f88567 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp @@ -0,0 +1,282 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using namespace ck::conv_tensor_rearrange_op; + +// Image to Column +// nhwc, 1d +void add_device_image_to_column_nwc_1d_bf16_instances( + std::vector>>& + instances); + +void add_device_image_to_column_nwc_1d_f16_instances( + std::vector>>& + instances); + +void add_device_image_to_column_nwc_1d_f32_instances( + std::vector>>& + instances); + +void add_device_image_to_column_nwc_1d_i8_instances( + std::vector>>& + instances); +// nhwc, 2d +void add_device_image_to_column_nhwc_2d_bf16_instances( + std::vector>>& + instances); + +void add_device_image_to_column_nhwc_2d_f16_instances( + std::vector>>& + instances); + +void add_device_image_to_column_nhwc_2d_f32_instances( + std::vector>>& + instances); + +void add_device_image_to_column_nhwc_2d_i8_instances( + std::vector< + std::unique_ptr>>& + instances); +// nhwc, 3d +void add_device_image_to_column_ndhwc_3d_bf16_instances( + std::vector>>& + instances); + +void add_device_image_to_column_ndhwc_3d_f16_instances( + std::vector>>& + instances); + +void add_device_image_to_column_ndhwc_3d_f32_instances( + std::vector>>& + instances); + +void add_device_image_to_column_ndhwc_3d_i8_instances( + std::vector< + std::unique_ptr>>& + instances); + +// Column to Image +// nhwc, 1d +void add_device_column_to_image_nwc_1d_bf16_instances( + std::vector>>& + instances); + +void add_device_column_to_image_nwc_1d_f16_instances( + std::vector>>& + instances); + +void add_device_column_to_image_nwc_1d_f32_instances( + std::vector>>& + instances); + +void add_device_column_to_image_nwc_1d_i8_instances( + std::vector>>& + instances); +// nhwc, 2d +void add_device_column_to_image_nhwc_2d_bf16_instances( + std::vector>>& + instances); + +void add_device_column_to_image_nhwc_2d_f16_instances( + std::vector>>& + instances); + +void add_device_column_to_image_nhwc_2d_f32_instances( + std::vector>>& + instances); + +void add_device_column_to_image_nhwc_2d_i8_instances( + std::vector< + std::unique_ptr>>& + instances); +// nhwc, 3d +void add_device_column_to_image_ndhwc_3d_bf16_instances( + std::vector>>& + instances); + +void add_device_column_to_image_ndhwc_3d_f16_instances( + std::vector>>& + instances); + +void add_device_column_to_image_ndhwc_3d_f32_instances( + std::vector>>& + instances); + +void add_device_column_to_image_ndhwc_3d_i8_instances( + std::vector< + std::unique_ptr>>& + instances); + +template +struct DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceConvTensorRearrange> +{ + using DeviceOp = DeviceConvTensorRearrange; + + static auto GetInstances() + { + std::vector> op_ptrs; + + if constexpr(is_same_v) + { + if constexpr(NumDimSpatial == 1 && is_same_v) + { + if constexpr(is_same_v && is_same_v) + { + add_device_image_to_column_nwc_1d_f32_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v) + { + add_device_image_to_column_nwc_1d_f16_instances(op_ptrs); + } + else if constexpr(is_same_v && + is_same_v) + { + add_device_image_to_column_nwc_1d_bf16_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v) + { + add_device_image_to_column_nwc_1d_i8_instances(op_ptrs); + } + } + else if constexpr(NumDimSpatial == 2 && is_same_v) + { + if constexpr(is_same_v && is_same_v) + { + add_device_image_to_column_nhwc_2d_f32_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v) + { + add_device_image_to_column_nhwc_2d_f16_instances(op_ptrs); + } + else if constexpr(is_same_v && + is_same_v) + { + add_device_image_to_column_nhwc_2d_bf16_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v) + { + add_device_image_to_column_nhwc_2d_i8_instances(op_ptrs); + } + } + else if constexpr(NumDimSpatial == 3 && is_same_v) + { + if constexpr(is_same_v && is_same_v) + { + add_device_image_to_column_ndhwc_3d_f32_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v) + { + add_device_image_to_column_ndhwc_3d_f16_instances(op_ptrs); + } + else if constexpr(is_same_v && + is_same_v) + { + add_device_image_to_column_ndhwc_3d_bf16_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v) + { + add_device_image_to_column_ndhwc_3d_i8_instances(op_ptrs); + } + } + } + else if constexpr(is_same_v) + { + if constexpr(NumDimSpatial == 1 && is_same_v) + { + if constexpr(is_same_v && is_same_v) + { + add_device_column_to_image_nwc_1d_f32_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v) + { + add_device_column_to_image_nwc_1d_f16_instances(op_ptrs); + } + else if constexpr(is_same_v && + is_same_v) + { + add_device_column_to_image_nwc_1d_bf16_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v) + { + add_device_column_to_image_nwc_1d_i8_instances(op_ptrs); + } + } + else if constexpr(NumDimSpatial == 2 && is_same_v) + { + if constexpr(is_same_v && is_same_v) + { + add_device_column_to_image_nhwc_2d_f32_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v) + { + add_device_column_to_image_nhwc_2d_f16_instances(op_ptrs); + } + else if constexpr(is_same_v && + is_same_v) + { + add_device_column_to_image_nhwc_2d_bf16_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v) + { + add_device_column_to_image_nhwc_2d_i8_instances(op_ptrs); + } + } + else if constexpr(NumDimSpatial == 3 && is_same_v) + { + if constexpr(is_same_v && is_same_v) + { + add_device_column_to_image_ndhwc_3d_f32_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v) + { + add_device_column_to_image_ndhwc_3d_f16_instances(op_ptrs); + } + else if constexpr(is_same_v && + is_same_v) + { + add_device_column_to_image_ndhwc_3d_bf16_instances(op_ptrs); + } + else if constexpr(is_same_v && is_same_v) + { + add_device_column_to_image_ndhwc_3d_i8_instances(op_ptrs); + } + } + } + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp new file mode 100644 index 0000000000..681f466677 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp @@ -0,0 +1,106 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp" + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using namespace ck::tensor_layout::convolution; +using namespace ck::conv_tensor_rearrange_op; + +using BF16 = ck::bhalf_t; +using F16 = ck::half_t; +using F32 = float; + +template +using S = ck::Sequence; + +template +using device_column_to_image_bf16_instances = std::tuple< + // clang-format off + //#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar| + //#####################| Dim| | | | Size| Block| Block| Cluster| Per| + //#####################| Spatial| | | | | | | Lengths| Vector| + //#####################| | | | | | | | | | + // generic instance + DeviceColumnToImageImpl, 1>, + + DeviceColumnToImageImpl, 4>, + DeviceColumnToImageImpl, 8>, + DeviceColumnToImageImpl, 4>, + DeviceColumnToImageImpl, 8>, + DeviceColumnToImageImpl, 4>, + DeviceColumnToImageImpl, 4>, + DeviceColumnToImageImpl, 8> + // clang-format on + >; + +template +using device_column_to_image_f16_instances = std::tuple< + // clang-format off + //#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar| + //#####################| Dim| | | | Size| Block| Block| Cluster| Per| + //#####################| Spatial| | | | | | | Lengths| Vector| + //#####################| | | | | | | | | | + // generic instance + DeviceColumnToImageImpl, 1>, + + DeviceColumnToImageImpl, 4>, + DeviceColumnToImageImpl, 8>, + DeviceColumnToImageImpl, 4>, + DeviceColumnToImageImpl, 8>, + DeviceColumnToImageImpl, 4>, + DeviceColumnToImageImpl, 4>, + DeviceColumnToImageImpl, 8> + // clang-format on + >; + +template +using device_column_to_image_f32_instances = std::tuple< + // clang-format off + //#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar| + //#####################| Dim| | | | Size| Block| Block| Cluster| Per| + //#####################| Spatial| | | | | | | Lengths| Vector| + //#####################| | | | | | | | | | + // generic instance + DeviceColumnToImageImpl, 1>, + + DeviceColumnToImageImpl, 4>, + DeviceColumnToImageImpl, 4>, + DeviceColumnToImageImpl, 4>, + DeviceColumnToImageImpl, 4> + // clang-format on + >; + +template +using device_column_to_image_i8_instances = std::tuple< + // clang-format off + //#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar| + //#####################| Dim| | | | Size| Block| Block| Cluster| Per| + //#####################| Spatial| | | | | | | Lengths| Vector| + //#####################| | | | | | | | | | + // generic instance + DeviceColumnToImageImpl, 1>, + + DeviceColumnToImageImpl, 4>, + DeviceColumnToImageImpl, 8>, + DeviceColumnToImageImpl, 4>, + DeviceColumnToImageImpl, 8>, + DeviceColumnToImageImpl, 4>, + DeviceColumnToImageImpl, 4>, + DeviceColumnToImageImpl, 8>, + DeviceColumnToImageImpl, 16> + // clang-format on + >; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/image_to_column/device_image_to_column_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp similarity index 70% rename from library/include/ck/library/tensor_operation_instance/gpu/image_to_column/device_image_to_column_instance.hpp rename to library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp index a2603218b2..74a2155a04 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/image_to_column/device_image_to_column_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp @@ -13,6 +13,7 @@ namespace device { namespace instance { using namespace ck::tensor_layout::convolution; +using namespace ck::conv_tensor_rearrange_op; using BF16 = ck::bhalf_t; using F16 = ck::half_t; @@ -28,17 +29,12 @@ using device_image_to_column_bf16_instances = std::tuple< //#####################| Dim| | | | Size| Block| Block| Cluster| Per| //#####################| Spatial| | | | | | | Lengths| Vector| //#####################| | | | | | | | | | - DeviceImageToColumnImpl, 1>, + // generic instance DeviceImageToColumnImpl, 1>, DeviceImageToColumnImpl, 4>, DeviceImageToColumnImpl, 8>, - DeviceImageToColumnImpl, 1>, - DeviceImageToColumnImpl, 1>, DeviceImageToColumnImpl, 4>, DeviceImageToColumnImpl, 8>, - DeviceImageToColumnImpl, 1>, - DeviceImageToColumnImpl, 1>, - DeviceImageToColumnImpl, 1>, DeviceImageToColumnImpl, 4>, DeviceImageToColumnImpl, 4>, DeviceImageToColumnImpl, 8> @@ -52,17 +48,13 @@ using device_image_to_column_f16_instances = std::tuple< //#####################| Dim| | | | Size| Block| Block| Cluster| Per| //#####################| Spatial| | | | | | | Lengths| Vector| //#####################| | | | | | | | | | - DeviceImageToColumnImpl, 1>, + // generic instance DeviceImageToColumnImpl, 1>, + DeviceImageToColumnImpl, 4>, DeviceImageToColumnImpl, 8>, - DeviceImageToColumnImpl, 1>, - DeviceImageToColumnImpl, 1>, DeviceImageToColumnImpl, 4>, DeviceImageToColumnImpl, 8>, - DeviceImageToColumnImpl, 1>, - DeviceImageToColumnImpl, 1>, - DeviceImageToColumnImpl, 1>, DeviceImageToColumnImpl, 4>, DeviceImageToColumnImpl, 4>, DeviceImageToColumnImpl, 8> @@ -76,15 +68,11 @@ using device_image_to_column_f32_instances = std::tuple< //#####################| Dim| | | | Size| Block| Block| Cluster| Per| //#####################| Spatial| | | | | | | Lengths| Vector| //#####################| | | | | | | | | | - DeviceImageToColumnImpl, 1>, + // generic instance DeviceImageToColumnImpl, 1>, + DeviceImageToColumnImpl, 4>, - DeviceImageToColumnImpl, 1>, - DeviceImageToColumnImpl, 1>, DeviceImageToColumnImpl, 4>, - DeviceImageToColumnImpl, 1>, - DeviceImageToColumnImpl, 1>, - DeviceImageToColumnImpl, 1>, DeviceImageToColumnImpl, 4>, DeviceImageToColumnImpl, 4> // clang-format on @@ -97,17 +85,13 @@ using device_image_to_column_i8_instances = std::tuple< //#####################| Dim| | | | Size| Block| Block| Cluster| Per| //#####################| Spatial| | | | | | | Lengths| Vector| //#####################| | | | | | | | | | - DeviceImageToColumnImpl, 1>, + // generic instance DeviceImageToColumnImpl, 1>, + DeviceImageToColumnImpl, 4>, DeviceImageToColumnImpl, 8>, - DeviceImageToColumnImpl, 1>, - DeviceImageToColumnImpl, 1>, DeviceImageToColumnImpl, 4>, DeviceImageToColumnImpl, 8>, - DeviceImageToColumnImpl, 1>, - DeviceImageToColumnImpl, 1>, - DeviceImageToColumnImpl, 1>, DeviceImageToColumnImpl, 4>, DeviceImageToColumnImpl, 4>, DeviceImageToColumnImpl, 8>, diff --git a/library/include/ck/library/tensor_operation_instance/gpu/image_to_column.hpp b/library/include/ck/library/tensor_operation_instance/gpu/image_to_column.hpp deleted file mode 100644 index 6c4526ba4e..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/image_to_column.hpp +++ /dev/null @@ -1,135 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include -#include -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/device_image_to_column.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" - -#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// nhwc, 1d -void add_device_image_to_column_nhwc_1d_bf16_instances( - std::vector>>& instances); - -void add_device_image_to_column_nhwc_1d_f16_instances( - std::vector>>& instances); - -void add_device_image_to_column_nhwc_1d_f32_instances( - std::vector>>& instances); - -void add_device_image_to_column_nhwc_1d_i8_instances( - std::vector>>& instances); -// nhwc, 2d -void add_device_image_to_column_nhwc_2d_bf16_instances( - std::vector>>& instances); - -void add_device_image_to_column_nhwc_2d_f16_instances( - std::vector>>& instances); - -void add_device_image_to_column_nhwc_2d_f32_instances( - std::vector>>& instances); - -void add_device_image_to_column_nhwc_2d_i8_instances( - std::vector>>& instances); -// nhwc, 3d -void add_device_image_to_column_nhwc_3d_bf16_instances( - std::vector>>& instances); - -void add_device_image_to_column_nhwc_3d_f16_instances( - std::vector>>& instances); - -void add_device_image_to_column_nhwc_3d_f32_instances( - std::vector>>& instances); - -void add_device_image_to_column_nhwc_3d_i8_instances( - std::vector>>& instances); - -template -struct DeviceOperationInstanceFactory< - ck::tensor_operation::device:: - DeviceImageToColumn> -{ - using DeviceOp = DeviceImageToColumn; - - static auto GetInstances() - { - std::vector> op_ptrs; - - if constexpr(NumDimSpatial == 1 && is_same_v) - { - if constexpr(is_same_v && is_same_v) - { - add_device_image_to_column_nhwc_1d_f32_instances(op_ptrs); - } - else if constexpr(is_same_v && is_same_v) - { - add_device_image_to_column_nhwc_1d_f16_instances(op_ptrs); - } - else if constexpr(is_same_v && - is_same_v) - { - add_device_image_to_column_nhwc_1d_bf16_instances(op_ptrs); - } - else if constexpr(is_same_v && is_same_v) - { - add_device_image_to_column_nhwc_1d_i8_instances(op_ptrs); - } - } - else if constexpr(NumDimSpatial == 2 && is_same_v) - { - if constexpr(is_same_v && is_same_v) - { - add_device_image_to_column_nhwc_2d_f32_instances(op_ptrs); - } - else if constexpr(is_same_v && is_same_v) - { - add_device_image_to_column_nhwc_2d_f16_instances(op_ptrs); - } - else if constexpr(is_same_v && - is_same_v) - { - add_device_image_to_column_nhwc_2d_bf16_instances(op_ptrs); - } - else if constexpr(is_same_v && is_same_v) - { - add_device_image_to_column_nhwc_2d_i8_instances(op_ptrs); - } - } - else if constexpr(NumDimSpatial == 3 && is_same_v) - { - if constexpr(is_same_v && is_same_v) - { - add_device_image_to_column_nhwc_3d_f32_instances(op_ptrs); - } - else if constexpr(is_same_v && is_same_v) - { - add_device_image_to_column_nhwc_3d_f16_instances(op_ptrs); - } - else if constexpr(is_same_v && - is_same_v) - { - add_device_image_to_column_nhwc_3d_bf16_instances(op_ptrs); - } - else if constexpr(is_same_v && is_same_v) - { - add_device_image_to_column_nhwc_3d_i8_instances(op_ptrs); - } - } - - return op_ptrs; - } -}; - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/column_to_image/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/column_to_image/CMakeLists.txt new file mode 100644 index 0000000000..5d1a554524 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/column_to_image/CMakeLists.txt @@ -0,0 +1,5 @@ +add_instance_library(device_column_to_image_instance + device_column_to_image_nhwc_1d_instance.cpp + device_column_to_image_nhwc_2d_instance.cpp + device_column_to_image_nhwc_3d_instance.cpp +) diff --git a/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_1d_instance.cpp b/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_1d_instance.cpp new file mode 100644 index 0000000000..8ba4d29775 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_1d_instance.cpp @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using namespace ck::conv_tensor_rearrange_op; + +void add_device_column_to_image_nwc_1d_bf16_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_BF16 + add_device_operation_instances(instances, device_column_to_image_bf16_instances<1, GNWC>{}); +#else + ignore = instances; +#endif +} + +void add_device_column_to_image_nwc_1d_f16_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_FP16 + add_device_operation_instances(instances, device_column_to_image_f16_instances<1, GNWC>{}); +#else + ignore = instances; +#endif +} + +void add_device_column_to_image_nwc_1d_f32_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_FP32 + add_device_operation_instances(instances, device_column_to_image_f32_instances<1, GNWC>{}); +#else + ignore = instances; +#endif +} + +void add_device_column_to_image_nwc_1d_i8_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_INT8 + add_device_operation_instances(instances, device_column_to_image_i8_instances<1, GNWC>{}); +#else + ignore = instances; +#endif +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_2d_instance.cpp b/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_2d_instance.cpp new file mode 100644 index 0000000000..4de665a63f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_2d_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using namespace ck::conv_tensor_rearrange_op; + +void add_device_column_to_image_nhwc_2d_bf16_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_BF16 + add_device_operation_instances(instances, device_column_to_image_bf16_instances<2, GNHWC>{}); +#else + ignore = instances; +#endif +} + +void add_device_column_to_image_nhwc_2d_f16_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_FP16 + add_device_operation_instances(instances, device_column_to_image_f16_instances<2, GNHWC>{}); +#else + ignore = instances; +#endif +} + +void add_device_column_to_image_nhwc_2d_f32_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_FP32 + add_device_operation_instances(instances, device_column_to_image_f32_instances<2, GNHWC>{}); +#else + ignore = instances; +#endif +} + +void add_device_column_to_image_nhwc_2d_i8_instances( + std::vector< + std::unique_ptr>>& + instances) +{ +#ifdef CK_ENABLE_INT8 + add_device_operation_instances(instances, device_column_to_image_i8_instances<2, GNHWC>{}); +#else + ignore = instances; +#endif +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_3d_instance.cpp b/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_3d_instance.cpp new file mode 100644 index 0000000000..9762b46c43 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/column_to_image/device_column_to_image_nhwc_3d_instance.cpp @@ -0,0 +1,62 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_column_to_image_instance.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +using namespace ck::conv_tensor_rearrange_op; + +void add_device_column_to_image_ndhwc_3d_bf16_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_BF16 + add_device_operation_instances(instances, device_column_to_image_bf16_instances<3, GNDHWC>{}); +#else + ignore = instances; +#endif +} + +void add_device_column_to_image_ndhwc_3d_f16_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_FP16 + add_device_operation_instances(instances, device_column_to_image_f16_instances<3, GNDHWC>{}); +#else + ignore = instances; +#endif +} + +void add_device_column_to_image_ndhwc_3d_f32_instances( + std::vector>>& + instances) +{ +#ifdef CK_ENABLE_FP32 + add_device_operation_instances(instances, device_column_to_image_f32_instances<3, GNDHWC>{}); +#else + ignore = instances; +#endif +} + +void add_device_column_to_image_ndhwc_3d_i8_instances( + std::vector< + std::unique_ptr>>& + instances) +{ +#ifdef CK_ENABLE_INT8 + add_device_operation_instances(instances, device_column_to_image_i8_instances<3, GNDHWC>{}); +#else + ignore = instances; +#endif +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_1d_instance.cpp b/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_1d_instance.cpp index c8463623c3..3a629f2346 100644 --- a/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_1d_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_1d_instance.cpp @@ -1,7 +1,7 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. -#include "ck/library/tensor_operation_instance/gpu/image_to_column/device_image_to_column_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp" #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { @@ -9,28 +9,50 @@ namespace tensor_operation { namespace device { namespace instance { -void add_device_image_to_column_nhwc_1d_bf16_instances( - std::vector>>& instances) +using namespace ck::conv_tensor_rearrange_op; + +void add_device_image_to_column_nwc_1d_bf16_instances( + std::vector>>& + instances) { +#ifdef CK_ENABLE_BF16 add_device_operation_instances(instances, device_image_to_column_bf16_instances<1, GNWC>{}); +#else + ignore = instances; +#endif } -void add_device_image_to_column_nhwc_1d_f16_instances( - std::vector>>& instances) +void add_device_image_to_column_nwc_1d_f16_instances( + std::vector>>& + instances) { +#ifdef CK_ENABLE_FP16 add_device_operation_instances(instances, device_image_to_column_f16_instances<1, GNWC>{}); +#else + ignore = instances; +#endif } -void add_device_image_to_column_nhwc_1d_f32_instances( - std::vector>>& instances) +void add_device_image_to_column_nwc_1d_f32_instances( + std::vector>>& + instances) { +#ifdef CK_ENABLE_FP32 add_device_operation_instances(instances, device_image_to_column_f32_instances<1, GNWC>{}); +#else + ignore = instances; +#endif } -void add_device_image_to_column_nhwc_1d_i8_instances( - std::vector>>& instances) +void add_device_image_to_column_nwc_1d_i8_instances( + std::vector>>& + instances) { +#ifdef CK_ENABLE_INT8 add_device_operation_instances(instances, device_image_to_column_i8_instances<1, GNWC>{}); +#else + ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_2d_instance.cpp b/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_2d_instance.cpp index 652c7fac2a..7115e75667 100644 --- a/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_2d_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_2d_instance.cpp @@ -1,7 +1,7 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. -#include "ck/library/tensor_operation_instance/gpu/image_to_column/device_image_to_column_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp" #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { @@ -9,28 +9,51 @@ namespace tensor_operation { namespace device { namespace instance { +using namespace ck::conv_tensor_rearrange_op; + void add_device_image_to_column_nhwc_2d_bf16_instances( - std::vector>>& instances) + std::vector>>& + instances) { +#ifdef CK_ENABLE_BF16 add_device_operation_instances(instances, device_image_to_column_bf16_instances<2, GNHWC>{}); +#else + ignore = instances; +#endif } void add_device_image_to_column_nhwc_2d_f16_instances( - std::vector>>& instances) + std::vector>>& + instances) { +#ifdef CK_ENABLE_FP16 add_device_operation_instances(instances, device_image_to_column_f16_instances<2, GNHWC>{}); +#else + ignore = instances; +#endif } void add_device_image_to_column_nhwc_2d_f32_instances( - std::vector>>& instances) + std::vector>>& + instances) { +#ifdef CK_ENABLE_FP32 add_device_operation_instances(instances, device_image_to_column_f32_instances<2, GNHWC>{}); +#else + ignore = instances; +#endif } void add_device_image_to_column_nhwc_2d_i8_instances( - std::vector>>& instances) + std::vector< + std::unique_ptr>>& + instances) { +#ifdef CK_ENABLE_INT8 add_device_operation_instances(instances, device_image_to_column_i8_instances<2, GNHWC>{}); +#else + ignore = instances; +#endif } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_3d_instance.cpp b/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_3d_instance.cpp index 07774504d7..8290dae928 100644 --- a/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_3d_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/image_to_column/device_image_to_column_nhwc_3d_instance.cpp @@ -1,7 +1,7 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. -#include "ck/library/tensor_operation_instance/gpu/image_to_column/device_image_to_column_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange/device_image_to_column_instance.hpp" #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" namespace ck { @@ -9,28 +9,51 @@ namespace tensor_operation { namespace device { namespace instance { -void add_device_image_to_column_nhwc_3d_bf16_instances( - std::vector>>& instances) +using namespace ck::conv_tensor_rearrange_op; + +void add_device_image_to_column_ndhwc_3d_bf16_instances( + std::vector>>& + instances) { +#ifdef CK_ENABLE_BF16 add_device_operation_instances(instances, device_image_to_column_bf16_instances<3, GNDHWC>{}); +#else + ignore = instances; +#endif } -void add_device_image_to_column_nhwc_3d_f16_instances( - std::vector>>& instances) +void add_device_image_to_column_ndhwc_3d_f16_instances( + std::vector>>& + instances) { +#ifdef CK_ENABLE_FP16 add_device_operation_instances(instances, device_image_to_column_f16_instances<3, GNDHWC>{}); +#else + ignore = instances; +#endif } -void add_device_image_to_column_nhwc_3d_f32_instances( - std::vector>>& instances) +void add_device_image_to_column_ndhwc_3d_f32_instances( + std::vector>>& + instances) { +#ifdef CK_ENABLE_FP32 add_device_operation_instances(instances, device_image_to_column_f32_instances<3, GNDHWC>{}); +#else + ignore = instances; +#endif } -void add_device_image_to_column_nhwc_3d_i8_instances( - std::vector>>& instances) +void add_device_image_to_column_ndhwc_3d_i8_instances( + std::vector< + std::unique_ptr>>& + instances) { +#ifdef CK_ENABLE_INT8 add_device_operation_instances(instances, device_image_to_column_i8_instances<3, GNDHWC>{}); +#else + ignore = instances; +#endif } } // namespace instance diff --git a/profiler/README.md b/profiler/README.md index d03bfa7fc4..d081fc33e9 100644 --- a/profiler/README.md +++ b/profiler/README.md @@ -185,7 +185,7 @@ GB/s: 69.2301 ``` Note: This kernel use atomic add, this will cause output buffer to be accumulated multiple times, causing verification failure. To work around it, do not use CK's own timer and do verification at the same time. -## Profile image to column kernels +## Profile image to column/column to image kernels ```bash # arg1: tensor operation (" OP_NAME ": " OP_DESC ") # arg2: data type (0: Input fp32, Weight fp32, Output fp32 @@ -197,6 +197,7 @@ Note: This kernel use atomic add, this will cause output buffer to be accumulate # arg5: initialization (0: no init, 1: integer value, 2: decimal value) # arg6: print tensor value (0: no; 1: yes) # arg7: time kernel (0: no, 1: yes) +# arg8: operation type (0: ImageToColumn, 1: ColumnToImage) # Following arguments (depending on number of spatial dims): # Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d) # G, N, K, C, @@ -207,8 +208,8 @@ Note: This kernel use atomic add, this will cause output buffer to be accumulate # , (ie LeftPy, LeftPx for 2D) # , (ie RightPy, RightPx for 2D) - ################ op datatype layout verify init log time Ndims G N K C Y X Hi Wi Sy Sx Dy Dx LeftPy LeftPx RightPy RightPx -./bin/ckProfiler image_to_column 0 0 1 1 0 1 2 1 256 1 512 3 3 28 28 1 1 1 1 0 0 0 0 + ################ op datatype layout verify init log time opType Ndims G N K C Y X Hi Wi Sy Sx Dy Dx LeftPy LeftPx RightPy RightPx +./bin/ckProfiler conv_tensor_rearrange 0 0 0 1 0 1 0 2 1 256 1 512 3 3 28 28 1 1 1 1 0 0 0 0 ``` @@ -222,3 +223,4 @@ name: DeviceImageToColumn<128, 32, 64, 4> avg_time: 3.12326 GB/s: 2042.59 ``` +Note: Column to image kernel adds to the output memory, this will cause output buffer to be accumulated multiple times, causing verification failure. To work around it, do not use CK's own timer and do verification at the same time. diff --git a/profiler/include/profiler/profile_image_to_column_impl.hpp b/profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp similarity index 56% rename from profiler/include/profiler/profile_image_to_column_impl.hpp rename to profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp index cc929e9220..dc3d748bf6 100644 --- a/profiler/include/profiler/profile_image_to_column_impl.hpp +++ b/profiler/include/profiler/profile_conv_tensor_rearrange_impl.hpp @@ -9,9 +9,11 @@ #include #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/device_image_to_column.hpp" +#include "ck/tensor_operation/gpu/device/device_conv_tensor_rearrange.hpp" +#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp" #include "ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp" -#include "ck/library/tensor_operation_instance/gpu/image_to_column.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp" +#include "ck/library/tensor_operation_instance/gpu/conv_tensor_rearrange.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" @@ -19,22 +21,88 @@ #include "ck/library/utility/convolution_parameter.hpp" #include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_image_to_column.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_column_to_image.hpp" namespace ck { namespace profiler { template using S = ck::Sequence; +using namespace conv_tensor_rearrange_op; + +template +Tensor create_input(const HostTensorDescriptor& image_desc, + const HostTensorDescriptor& gemm_desc) +{ + if constexpr(std::is_same_v) + { + Tensor input(image_desc); + return input; + } + else if constexpr(std::is_same_v) + { + Tensor input(gemm_desc); + return input; + } + else + { + throw std::runtime_error("Unsupported op!"); + } +} + +template +Tensor create_output(const HostTensorDescriptor& image_desc, + const HostTensorDescriptor& gemm_desc) +{ + if constexpr(std::is_same_v) + { + Tensor output(gemm_desc); + return output; + } + else if constexpr(std::is_same_v) + { + Tensor output(image_desc); + return output; + } + else + { + throw std::runtime_error("Unsupported op!"); + } +} template -bool profile_image_to_column_impl(int do_verification, - int init_method, - bool do_log, - bool time_kernel, - const ck::utils::conv::ConvParam& conv_param) + typename OutputDataType, + typename ConvTensorRearrangeOp> +static auto make_ref_op() +{ + if constexpr(std::is_same_v) + { + return ck::tensor_operation::host:: + ReferenceImageToColumn{}; + } + else if constexpr(std::is_same_v) + { + return ck::tensor_operation::host:: + ReferenceColumnToImage{}; + } + else + { + throw std::runtime_error("Unsupported op!"); + } +} + +template +bool profile_conv_tensor_rearrange_impl(int do_verification, + int init_method, + bool do_log, + bool time_kernel, + const ck::utils::conv::ConvParam& conv_param) { const ck::index_t NDoHoWo = conv_param.N_ * @@ -45,16 +113,16 @@ bool profile_image_to_column_impl(int do_verification, ck::accumulate_n( conv_param.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>()); - const auto in_desc = + const auto image_desc = ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed( conv_param); - const auto out_desc = HostTensorDescriptor({NDoHoWo, CZYX}); + const auto gemm_desc = HostTensorDescriptor({NDoHoWo, CZYX}); std::array input_spatial_lengths{}; std::array filter_spatial_lengths{}; std::array output_spatial_lengths{}; - std::array input_g_n_c_wis_strides{}; - std::array output_m_k_strides{}; + std::array image_g_n_c_wis_strides{}; + std::array gemm_m_k_strides{}; std::array conv_filter_strides{}; std::array conv_filter_dilations{}; std::array input_left_pads{}; @@ -65,16 +133,19 @@ bool profile_image_to_column_impl(int do_verification, copy(conv_param.input_spatial_lengths_, input_spatial_lengths); copy(conv_param.filter_spatial_lengths_, filter_spatial_lengths); copy(conv_param.output_spatial_lengths_, output_spatial_lengths); - copy(in_desc.GetStrides(), input_g_n_c_wis_strides); - copy(out_desc.GetStrides(), output_m_k_strides); + copy(image_desc.GetStrides(), image_g_n_c_wis_strides); + copy(gemm_desc.GetStrides(), gemm_m_k_strides); copy(conv_param.conv_filter_strides_, conv_filter_strides); copy(conv_param.conv_filter_dilations_, conv_filter_dilations); copy(conv_param.input_left_pads_, input_left_pads); copy(conv_param.input_right_pads_, input_right_pads); - Tensor input(in_desc); - Tensor host_output(out_desc); - Tensor device_output(out_desc); + Tensor input = + create_input(image_desc, gemm_desc); + Tensor device_output = + create_output(image_desc, gemm_desc); + Tensor host_output = + create_output(image_desc, gemm_desc); std::cout << "input: " << input.mDesc << std::endl; std::cout << "output: " << host_output.mDesc << std::endl; @@ -94,17 +165,21 @@ bool profile_image_to_column_impl(int do_verification, // run reference op if(do_verification) { - auto ref_image_to_column = ck::tensor_operation::host:: - ReferenceImageToColumn{}; + auto ref_conv_tensor_rearrange = make_ref_op(); - auto ref_invoker = ref_image_to_column.MakeInvoker(); - auto ref_argument = ref_image_to_column.MakeArgument(input, - host_output, - conv_param.filter_spatial_lengths_, - conv_param.conv_filter_strides_, - conv_param.conv_filter_dilations_, - conv_param.input_left_pads_, - conv_param.input_right_pads_); + auto ref_invoker = ref_conv_tensor_rearrange.MakeInvoker(); + auto ref_argument = + ref_conv_tensor_rearrange.MakeArgument(input, + host_output, + conv_param.filter_spatial_lengths_, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_); // init host output to zero host_output.SetZero(); @@ -112,8 +187,11 @@ bool profile_image_to_column_impl(int do_verification, ref_invoker.Run(ref_argument); } - using DeviceOp = ck::tensor_operation::device:: - DeviceImageToColumn; + using DeviceOp = ck::tensor_operation::device::DeviceConvTensorRearrange; // get device op instances const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< @@ -139,8 +217,8 @@ bool profile_image_to_column_impl(int do_verification, input_spatial_lengths, filter_spatial_lengths, output_spatial_lengths, - input_g_n_c_wis_strides, - output_m_k_strides, + image_g_n_c_wis_strides, + gemm_m_k_strides, conv_filter_strides, conv_filter_dilations, input_left_pads, diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt index 7da7613f26..8d9a47cd6b 100644 --- a/profiler/src/CMakeLists.txt +++ b/profiler/src/CMakeLists.txt @@ -28,7 +28,7 @@ set(PROFILER_SOURCES profile_contraction_bilinear.cpp profile_contraction_scale.cpp profile_grouped_conv_bwd_data.cpp - profile_image_to_column.cpp + profile_conv_tensor_rearrange.cpp ) if(DL_KERNELS) list(APPEND PROFILER_SOURCES profile_batched_gemm_multi_d.cpp) @@ -84,6 +84,7 @@ target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_max_pool_bwd_instanc target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv2d_bwd_data_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_grouped_conv3d_bwd_data_instance) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_image_to_column_instance) +target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_column_to_image_instance) if(DL_KERNELS) target_link_libraries(${PROFILER_EXECUTABLE} PRIVATE device_batched_gemm_multi_d_instance) endif() diff --git a/profiler/src/profile_conv_tensor_rearrange.cpp b/profiler/src/profile_conv_tensor_rearrange.cpp new file mode 100644 index 0000000000..bad5ce40a5 --- /dev/null +++ b/profiler/src/profile_conv_tensor_rearrange.cpp @@ -0,0 +1,251 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include + +#include "profiler/profile_conv_tensor_rearrange_impl.hpp" +#include "profiler_operation_registry.hpp" + +namespace { + +enum struct RearrangeOp +{ + ImageToColumn, // 0 + ColumnToImage, // 1 +}; + +enum struct ConvLayout +{ + NHWC, // 0 +}; + +enum struct DataType +{ + F32_F32, // 0 + F16_F16, // 1 + BF16_BF16, // 2 + INT8_INT8, // 3 +}; + +#define OP_NAME "conv_tensor_rearrange" +#define OP_DESC "Conv Tensor Rearrange" + +static void print_helper_msg() +{ + std::cout + // clang-format off + << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n" + << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n" + << " 1: Input fp16, Weight fp16, Output fp16\n" + << " 2: Input bf16, Weight bf16, Output bf16\n" + << " 3: Input int8, Weight int8, Output int8)\n" + << "arg3: tensor layout (0: Input[N, Hi, Wi, C], Output[N * Ho * Wo, Y * X * C])\n" + << "arg4: verification (0: no, 1: yes)\n" + << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n" + << "arg6: print tensor value (0: no; 1: yes)\n" + << "arg7: time kernel (0: no, 1: yes)\n" + << "arg8: operation type (0: ImageToColumn, 1: ColumnToImage)\n" + << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl; + // clang-format on +} + +} // namespace + +int profile_conv_tensor_rearrange(int argc, char* argv[]) +{ + // 9 for control, 1 for num_dim_spatial + if(argc < 10) + { + print_helper_msg(); + return 1; + } + + const auto data_type = static_cast(std::stoi(argv[2])); + const auto layout = static_cast(std::stoi(argv[3])); + const bool do_verification = std::stoi(argv[4]); + const int init_method = std::stoi(argv[5]); + const bool do_log = std::stoi(argv[6]); + const bool time_kernel = std::stoi(argv[7]); + const auto rearrange_op = static_cast(std::stoi(argv[8])); + const int num_dim_spatial = std::stoi(argv[9]); + + // 9 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial + if(argc != 9 + 1 + 4 + 6 * num_dim_spatial) + { + print_helper_msg(); + return 1; + } + + const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 10, argv); + + using F32 = float; + using F16 = ck::half_t; + using BF16 = ck::bhalf_t; + using INT8 = int8_t; + + using namespace ck::tensor_layout::convolution; + using namespace ck::conv_tensor_rearrange_op; + + constexpr auto I1 = ck::Number<1>{}; + constexpr auto I2 = ck::Number<2>{}; + constexpr auto I3 = ck::Number<3>{}; + + auto profile = [&](auto num_dim_spatial_tmp, + auto in_layout, + auto in_type, + auto out_type, + auto rearrange_op_type) { + constexpr ck::index_t NDimSpatial = num_dim_spatial_tmp.value; + + using InLayout = decltype(in_layout); + + using InDataType = decltype(in_type); + using OutDataType = decltype(out_type); + + using Op = decltype(rearrange_op_type); + + bool pass = ck::profiler:: + profile_conv_tensor_rearrange_impl( + do_verification, init_method, do_log, time_kernel, params); + + return pass ? 0 : 1; + }; + + // Image To Column + if(rearrange_op == RearrangeOp::ImageToColumn) + { + // NHWC + if(layout == ConvLayout::NHWC) + { + if(num_dim_spatial == 1) + { + if(data_type == DataType::F32_F32) + { + return profile(I1, GNWC{}, F32{}, F32{}, ImageToColumn{}); + } + else if(data_type == DataType::F16_F16) + { + return profile(I1, GNWC{}, F16{}, F16{}, ImageToColumn{}); + } + else if(data_type == DataType::BF16_BF16) + { + return profile(I1, GNWC{}, BF16{}, BF16{}, ImageToColumn{}); + } + else if(data_type == DataType::INT8_INT8) + { + return profile(I1, GNWC{}, INT8{}, INT8{}, ImageToColumn{}); + } + } + else if(num_dim_spatial == 2) + { + if(data_type == DataType::F32_F32) + { + return profile(I2, GNHWC{}, F32{}, F32{}, ImageToColumn{}); + } + else if(data_type == DataType::F16_F16) + { + return profile(I2, GNHWC{}, F16{}, F16{}, ImageToColumn{}); + } + else if(data_type == DataType::BF16_BF16) + { + return profile(I2, GNHWC{}, BF16{}, BF16{}, ImageToColumn{}); + } + else if(data_type == DataType::INT8_INT8) + { + return profile(I2, GNHWC{}, INT8{}, INT8{}, ImageToColumn{}); + } + } + else if(num_dim_spatial == 3) + { + if(data_type == DataType::F32_F32) + { + return profile(I3, GNDHWC{}, F32{}, F32{}, ImageToColumn{}); + } + else if(data_type == DataType::F16_F16) + { + return profile(I3, GNDHWC{}, F16{}, F16{}, ImageToColumn{}); + } + else if(data_type == DataType::BF16_BF16) + { + return profile(I3, GNDHWC{}, BF16{}, BF16{}, ImageToColumn{}); + } + else if(data_type == DataType::INT8_INT8) + { + return profile(I3, GNDHWC{}, INT8{}, INT8{}, ImageToColumn{}); + } + } + } + } + else if(rearrange_op == RearrangeOp::ColumnToImage) + { + // NHWC + if(layout == ConvLayout::NHWC) + { + if(num_dim_spatial == 1) + { + if(data_type == DataType::F32_F32) + { + return profile(I1, GNWC{}, F32{}, F32{}, ColumnToImage{}); + } + else if(data_type == DataType::F16_F16) + { + return profile(I1, GNWC{}, F16{}, F16{}, ColumnToImage{}); + } + else if(data_type == DataType::BF16_BF16) + { + return profile(I1, GNWC{}, BF16{}, BF16{}, ColumnToImage{}); + } + else if(data_type == DataType::INT8_INT8) + { + return profile(I1, GNWC{}, INT8{}, INT8{}, ColumnToImage{}); + } + } + else if(num_dim_spatial == 2) + { + if(data_type == DataType::F32_F32) + { + return profile(I2, GNHWC{}, F32{}, F32{}, ColumnToImage{}); + } + else if(data_type == DataType::F16_F16) + { + return profile(I2, GNHWC{}, F16{}, F16{}, ColumnToImage{}); + } + else if(data_type == DataType::BF16_BF16) + { + return profile(I2, GNHWC{}, BF16{}, BF16{}, ColumnToImage{}); + } + else if(data_type == DataType::INT8_INT8) + { + return profile(I2, GNHWC{}, INT8{}, INT8{}, ColumnToImage{}); + } + } + else if(num_dim_spatial == 3) + { + if(data_type == DataType::F32_F32) + { + return profile(I3, GNDHWC{}, F32{}, F32{}, ColumnToImage{}); + } + else if(data_type == DataType::F16_F16) + { + return profile(I3, GNDHWC{}, F16{}, F16{}, ColumnToImage{}); + } + else if(data_type == DataType::BF16_BF16) + { + return profile(I3, GNDHWC{}, BF16{}, BF16{}, ColumnToImage{}); + } + else if(data_type == DataType::INT8_INT8) + { + return profile(I3, GNDHWC{}, INT8{}, INT8{}, ColumnToImage{}); + } + } + } + } + + std::cout << "this data_type & layout is not implemented" << std::endl; + return 1; +} + +REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_conv_tensor_rearrange); diff --git a/profiler/src/profile_image_to_column.cpp b/profiler/src/profile_image_to_column.cpp deleted file mode 100644 index bf4312a6cf..0000000000 --- a/profiler/src/profile_image_to_column.cpp +++ /dev/null @@ -1,169 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#include -#include -#include -#include - -#include "profiler/profile_image_to_column_impl.hpp" -#include "profiler_operation_registry.hpp" - -namespace { - -enum struct ConvLayout -{ - NHWC, // 0 -}; - -enum struct DataType -{ - F32_F32, // 0 - F16_F16, // 1 - BF16_BF16, // 2 - INT8_INT8, // 3 -}; - -#define OP_NAME "image_to_column" -#define OP_DESC "Image To Column" - -static void print_helper_msg() -{ - std::cout - // clang-format off - << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n" - << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n" - << " 1: Input fp16, Weight fp16, Output fp16\n" - << " 2: Input bf16, Weight bf16, Output bf16\n" - << " 3: Input int8, Weight int8, Output int8)\n" - << "arg3: tensor layout (0: Input[N, Hi, Wi, C], Output[N * Ho * Wo, Y * X * C])\n" - << "arg4: verification (0: no, 1: yes)\n" - << "arg5: initialization (0: no init, 1: integer value, 2: decimal value)\n" - << "arg6: print tensor value (0: no; 1: yes)\n" - << "arg7: time kernel (0: no, 1: yes)\n" - << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl; - // clang-format on -} - -} // namespace - -int profile_image_to_column(int argc, char* argv[]) -{ - // 8 for control, 1 for num_dim_spatial - if(argc < 9) - { - print_helper_msg(); - return 1; - } - - const auto data_type = static_cast(std::stoi(argv[2])); - const auto layout = static_cast(std::stoi(argv[3])); - const bool do_verification = std::stoi(argv[4]); - const int init_method = std::stoi(argv[5]); - const bool do_log = std::stoi(argv[6]); - const bool time_kernel = std::stoi(argv[7]); - const int num_dim_spatial = std::stoi(argv[8]); - - // 8 for control, 1 for num_dim_spatial, 4 for G/N/K/C, and 6 * num_dim_spatial - if(argc != 8 + 1 + 4 + 6 * num_dim_spatial) - { - print_helper_msg(); - return 1; - } - - const auto params = ck::utils::conv::parse_conv_param(num_dim_spatial, 9, argv); - - using F32 = float; - using F16 = ck::half_t; - using BF16 = ck::bhalf_t; - using INT8 = int8_t; - - using namespace ck::tensor_layout::convolution; - - constexpr auto I1 = ck::Number<1>{}; - constexpr auto I2 = ck::Number<2>{}; - constexpr auto I3 = ck::Number<3>{}; - - auto profile = [&](auto num_dim_spatial_tmp, auto in_layout, auto in_type, auto out_type) { - constexpr ck::index_t NDimSpatial = num_dim_spatial_tmp.value; - - using InLayout = decltype(in_layout); - - using InDataType = decltype(in_type); - using OutDataType = decltype(out_type); - - bool pass = ck::profiler:: - profile_image_to_column_impl( - do_verification, init_method, do_log, time_kernel, params); - - return pass ? 0 : 1; - }; - - // NHWC - if(layout == ConvLayout::NHWC) - { - if(num_dim_spatial == 1) - { - if(data_type == DataType::F32_F32) - { - return profile(I1, GNWC{}, F32{}, F32{}); - } - else if(data_type == DataType::F16_F16) - { - return profile(I1, GNWC{}, F16{}, F16{}); - } - else if(data_type == DataType::BF16_BF16) - { - return profile(I1, GNWC{}, BF16{}, BF16{}); - } - else if(data_type == DataType::INT8_INT8) - { - return profile(I1, GNWC{}, INT8{}, INT8{}); - } - } - else if(num_dim_spatial == 2) - { - if(data_type == DataType::F32_F32) - { - return profile(I2, GNHWC{}, F32{}, F32{}); - } - else if(data_type == DataType::F16_F16) - { - return profile(I2, GNHWC{}, F16{}, F16{}); - } - else if(data_type == DataType::BF16_BF16) - { - return profile(I2, GNHWC{}, BF16{}, BF16{}); - } - else if(data_type == DataType::INT8_INT8) - { - return profile(I2, GNHWC{}, INT8{}, INT8{}); - } - } - else if(num_dim_spatial == 3) - { - if(data_type == DataType::F32_F32) - { - return profile(I3, GNDHWC{}, F32{}, F32{}); - } - else if(data_type == DataType::F16_F16) - { - return profile(I3, GNDHWC{}, F16{}, F16{}); - } - else if(data_type == DataType::BF16_BF16) - { - return profile(I3, GNDHWC{}, BF16{}, BF16{}); - } - else if(data_type == DataType::INT8_INT8) - { - return profile(I3, GNDHWC{}, INT8{}, INT8{}); - } - } - } - - std::cout << "this data_type & layout is not implemented" << std::endl; - - return 1; -} - -REGISTER_PROFILER_OPERATION(OP_NAME, OP_DESC, profile_image_to_column); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 368965aa8a..07dd675afc 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -155,7 +155,7 @@ add_subdirectory(contraction) add_subdirectory(pool) add_subdirectory(batched_gemm_multi_d) add_subdirectory(grouped_convnd_bwd_data) -add_subdirectory(image_to_column) +add_subdirectory(conv_tensor_rearrange) if(GPU_TARGETS MATCHES "gfx11") add_subdirectory(wmma_op) endif() diff --git a/test/conv_tensor_rearrange/CMakeLists.txt b/test/conv_tensor_rearrange/CMakeLists.txt new file mode 100644 index 0000000000..f6ad263242 --- /dev/null +++ b/test/conv_tensor_rearrange/CMakeLists.txt @@ -0,0 +1,4 @@ +add_gtest_executable(test_conv_tensor_rearrange test_conv_tensor_rearrange.cpp) +target_link_libraries(test_conv_tensor_rearrange PRIVATE utility device_image_to_column_instance device_column_to_image_instance) +add_gtest_executable(test_conv_tensor_rearrange_interface test_conv_tensor_rearrange_interface.cpp) +target_link_libraries(test_conv_tensor_rearrange_interface PRIVATE utility) diff --git a/test/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp b/test/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp new file mode 100644 index 0000000000..7065b03e0d --- /dev/null +++ b/test/conv_tensor_rearrange/test_conv_tensor_rearrange.cpp @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include + +#include "profiler/profile_conv_tensor_rearrange_impl.hpp" + +template +class TestConvTensorRearrange : public ::testing::Test +{ + protected: + using ImLayout = std::tuple_element_t<0, Tuple>; + using ConvTensorRearrangeOp = std::tuple_element_t<1, Tuple>; + + std::vector conv_params; + + template + void Run() + { + EXPECT_FALSE(conv_params.empty()); + bool pass = true; + for(auto& param : conv_params) + { + pass = pass && ck::profiler::profile_conv_tensor_rearrange_impl( + true, // do_verification + 1, // init_method: integer value + false, // do_log + false, // time_kernel + param); + } + EXPECT_TRUE(pass); + } +}; + +using namespace ck::tensor_layout::convolution; +using namespace ck::conv_tensor_rearrange_op; + +using KernelTypes1d = + ::testing::Types, std::tuple>; + +using KernelTypes2d = + ::testing::Types, std::tuple>; + +using KernelTypes3d = + ::testing::Types, std::tuple>; + +template +class TestConvTensorRearrange1d : public TestConvTensorRearrange +{ +}; + +template +class TestConvTensorRearrange2d : public TestConvTensorRearrange +{ +}; + +template +class TestConvTensorRearrange3d : public TestConvTensorRearrange +{ +}; + +TYPED_TEST_SUITE(TestConvTensorRearrange1d, KernelTypes1d); +TYPED_TEST_SUITE(TestConvTensorRearrange2d, KernelTypes2d); +TYPED_TEST_SUITE(TestConvTensorRearrange3d, KernelTypes3d); + +TYPED_TEST(TestConvTensorRearrange1d, Test1D) +{ + this->conv_params.clear(); + + this->conv_params.push_back({1, 1, 4, 1, 192, {3}, {28}, {1}, {1}, {1}, {1}}); + this->conv_params.push_back({1, 1, 64, 1, 64, {3}, {14}, {1}, {1}, {1}, {1}}); + this->conv_params.push_back({1, 1, 64, 1, 64, {1}, {7}, {3}, {1}, {0}, {0}}); + this->conv_params.push_back({1, 1, 64, 1, 64, {1}, {3}, {1}, {1}, {0}, {0}}); + // ScalarPerVector should be 1 + this->conv_params.push_back({1, 1, 4, 1, 1, {3}, {28}, {1}, {1}, {1}, {1}}); + // stride != 1 + this->conv_params.push_back({1, 1, 1, 1, 4, {3}, {28}, {2}, {1}, {1}, {1}}); + // dilation != 1 + this->conv_params.push_back({1, 1, 1, 1, 4, {3}, {28}, {1}, {2}, {1}, {1}}); +#ifdef CK_ENABLE_FP32 + this->template Run<1, float, float>(); +#endif +#ifdef CK_ENABLE_BF16 + this->template Run<1, ck::bhalf_t, ck::bhalf_t>(); +#endif +#ifdef CK_ENABLE_FP16 + this->template Run<1, ck::half_t, ck::half_t>(); +#endif +#ifdef CK_ENABLE_INT8 + this->template Run<1, int8_t, int8_t>(); +#endif +} + +TYPED_TEST(TestConvTensorRearrange2d, Test2D) +{ + this->conv_params.clear(); + + this->conv_params.push_back( + {2, 1, 4, 1, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + this->conv_params.push_back( + {2, 1, 64, 1, 64, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + this->conv_params.push_back({2, 1, 64, 1, 64, {1, 1}, {7, 7}, {3, 3}, {1, 1}, {0, 0}, {0, 0}}); + this->conv_params.push_back({2, 1, 64, 1, 64, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}); + this->conv_params.push_back( + {2, 1, 64, 1, 64, {3, 3}, {28, 28}, {2, 2}, {2, 2}, {1, 1}, {1, 1}}); +#ifdef CK_ENABLE_FP32 + this->template Run<2, float, float>(); +#endif +#ifdef CK_ENABLE_BF16 + this->template Run<2, ck::bhalf_t, ck::bhalf_t>(); +#endif +#ifdef CK_ENABLE_FP16 + this->template Run<2, ck::half_t, ck::half_t>(); +#endif +#ifdef CK_ENABLE_INT8 + this->template Run<2, int8_t, int8_t>(); +#endif +} + +TYPED_TEST(TestConvTensorRearrange3d, Test3D) +{ + this->conv_params.clear(); + this->conv_params.push_back( + {3, 1, 16, 1, 64, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {3, 3, 3}, {0, 0, 0}, {0, 0, 0}}); + this->conv_params.push_back( + {3, 1, 2, 1, 64, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); + this->conv_params.push_back( + {3, 1, 32, 1, 64, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); + this->conv_params.push_back( + {3, 1, 64, 1, 64, {3, 3, 3}, {14, 14, 14}, {2, 2, 2}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}}); +#ifdef CK_ENABLE_FP32 + this->template Run<3, float, float>(); +#endif +#ifdef CK_ENABLE_BF16 + this->template Run<3, ck::bhalf_t, ck::bhalf_t>(); +#endif +#ifdef CK_ENABLE_FP16 + this->template Run<3, ck::half_t, ck::half_t>(); +#endif +#ifdef CK_ENABLE_INT8 + this->template Run<3, int8_t, int8_t>(); +#endif +} diff --git a/test/conv_tensor_rearrange/test_conv_tensor_rearrange_interface.cpp b/test/conv_tensor_rearrange/test_conv_tensor_rearrange_interface.cpp new file mode 100644 index 0000000000..57fcdc4269 --- /dev/null +++ b/test/conv_tensor_rearrange/test_conv_tensor_rearrange_interface.cpp @@ -0,0 +1,260 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2023, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_column_to_image_impl.hpp" +#include "ck/tensor_operation/gpu/device/conv_tensor_rearrange_op.hpp" + +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" + +#include + +using DataType = float; +using ImLayout = ck::tensor_layout::convolution::GNWC; + +template +using S = ck::Sequence; + +using namespace ck::conv_tensor_rearrange_op; + +template +class TestConvTensorRearrangeInterface : public ::testing::Test +{ + protected: + static constexpr ck::index_t NDimSpatial = 1; + + // clang-format off + using DeviceImgToColInstance = ck::tensor_operation::device::DeviceImageToColumnImpl + // Num| ImLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar| + // Dim| | | | Size| Block| Block| Cluster| Per| + // Spatial| | | | | | | Lengths| Vector| + // | | | | | | | | | + < NDimSpatial, ImLayout, DataType, DataType, 256, 128, 128, S<16, 16>,ScalarPerVector>; + using DeviceColToimgInstance = ck::tensor_operation::device::DeviceColumnToImageImpl + // Num| ImLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar| + // Dim| | | | Size| Block| Block| Cluster| Per| + // Spatial| | | | | | | Lengths| Vector| + // | | | | | | | | | + < NDimSpatial, ImLayout, DataType, DataType, 256, 128, 128, S<16, 16>,ScalarPerVector>; + // clang-format on + + ck::utils::conv::ConvParam conv_param; + + template + bool Run() + { + + const auto N = conv_param.N_; + const auto C = conv_param.C_; + const auto FakeC = + conv_param.C_ / 2; // Fake C to simulate the behavior that C is not packed + + const ck::index_t NDoHoWo = + N * + ck::accumulate_n( + conv_param.output_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>()); + const ck::index_t CZYX = + C * + ck::accumulate_n( + conv_param.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>()); + + const auto image_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed( + conv_param); + const auto gemm_desc = HostTensorDescriptor({NDoHoWo, CZYX}); + + std::array input_spatial_lengths{}; + std::array filter_spatial_lengths{}; + std::array output_spatial_lengths{}; + std::array input_g_n_c_wis_strides{}; + std::array output_m_k_strides{}; + std::array conv_filter_strides{}; + std::array conv_filter_dilations{}; + std::array input_left_pads{}; + std::array input_right_pads{}; + + auto copy = [](const auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); }; + + copy(conv_param.input_spatial_lengths_, input_spatial_lengths); + copy(conv_param.filter_spatial_lengths_, filter_spatial_lengths); + copy(conv_param.output_spatial_lengths_, output_spatial_lengths); + copy(image_desc.GetStrides(), input_g_n_c_wis_strides); + copy(gemm_desc.GetStrides(), output_m_k_strides); + copy(conv_param.conv_filter_strides_, conv_filter_strides); + copy(conv_param.conv_filter_dilations_, conv_filter_dilations); + copy(conv_param.input_left_pads_, input_left_pads); + copy(conv_param.input_right_pads_, input_right_pads); + + if constexpr(std::is_same_v) + { + auto img2col = DeviceImgToColInstance{}; + auto argument = img2col.MakeArgument(nullptr, + nullptr, + N, + IsCPacked ? C : FakeC, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + input_g_n_c_wis_strides, + output_m_k_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads); + + return img2col.IsSupportedArgument(argument); + } + else if constexpr(std::is_same_v) + { + auto col2img = DeviceColToimgInstance{}; + auto argument = col2img.MakeArgument(nullptr, + nullptr, + N, + IsCPacked ? C : FakeC, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + input_g_n_c_wis_strides, + output_m_k_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads); + + return col2img.IsSupportedArgument(argument); + } + } +}; + +class TestConvTensorRearrangeInterface1ScalarPerVector + : public TestConvTensorRearrangeInterface<1, true> +{ +}; + +class TestConvTensorRearrangeInterface4ScalarPerVector + : public TestConvTensorRearrangeInterface<4, true> +{ +}; + +class TestConvTensorRearrangeInterface4ScalarPerVectorFakeC + : public TestConvTensorRearrangeInterface<4, false> +{ +}; + +TEST_F(TestConvTensorRearrangeInterface1ScalarPerVector, X1ScalarPerVector) +{ + // vector load C * X % ScalarPerVector + this->conv_param = {1, 1, 1, 1, 1, {3}, {3}, {1}, {1}, {0}, {0}}; + bool is_supported = this->template Run(); + EXPECT_TRUE(is_supported); + is_supported = this->template Run(); + EXPECT_TRUE(is_supported); + // vector load C * left_pad_x % ScalarPerVector + this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {3}, {0}}; + is_supported = this->template Run(); + EXPECT_TRUE(is_supported); + is_supported = this->template Run(); + EXPECT_TRUE(is_supported); + // vector load C * right_pad_x % ScalarPerVector + this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {0}, {3}}; + is_supported = this->template Run(); + EXPECT_TRUE(is_supported); + is_supported = this->template Run(); + EXPECT_TRUE(is_supported); + // vector load C % ScalarPerVector, right_pad and stride + this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {0}, {3}}; + is_supported = this->template Run(); + EXPECT_TRUE(is_supported); + is_supported = this->template Run(); + EXPECT_TRUE(is_supported); + // vector load C % ScalarPerVector, left_pad and stride + this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {3}, {0}}; + is_supported = this->template Run(); + EXPECT_TRUE(is_supported); + is_supported = this->template Run(); + EXPECT_TRUE(is_supported); + // vector load C % ScalarPerVector, dilation + this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {2}, {0}, {0}}; + is_supported = this->template Run(); + EXPECT_TRUE(is_supported); + is_supported = this->template Run(); + EXPECT_TRUE(is_supported); + // C = 4 + this->conv_param = {1, 1, 1, 1, 4, {3}, {3}, {1}, {1}, {3}, {3}}; + is_supported = this->template Run(); + EXPECT_TRUE(is_supported); + is_supported = this->template Run(); + EXPECT_TRUE(is_supported); +} + +TEST_F(TestConvTensorRearrangeInterface4ScalarPerVector, X4ScalarPerVector) +{ + // vector load C * X % ScalarPerVector + this->conv_param = {1, 1, 1, 1, 1, {3}, {3}, {1}, {1}, {0}, {0}}; + bool is_supported = this->template Run(); + EXPECT_FALSE(is_supported); + is_supported = this->template Run(); + EXPECT_FALSE(is_supported); + // vector load C * left_pad_x % ScalarPerVector + this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {3}, {0}}; + is_supported = this->template Run(); + EXPECT_FALSE(is_supported); + is_supported = this->template Run(); + EXPECT_FALSE(is_supported); + // vector load C * right_pad_x % ScalarPerVector + this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {0}, {3}}; + is_supported = this->template Run(); + EXPECT_FALSE(is_supported); + is_supported = this->template Run(); + EXPECT_FALSE(is_supported); + // vector load C % ScalarPerVector, right_pad and stride + this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {0}, {3}}; + is_supported = this->template Run(); + EXPECT_FALSE(is_supported); + is_supported = this->template Run(); + EXPECT_FALSE(is_supported); + // vector load C % ScalarPerVector, left_pad and stride + this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {3}, {0}}; + is_supported = this->template Run(); + EXPECT_FALSE(is_supported); + is_supported = this->template Run(); + EXPECT_FALSE(is_supported); + // vector load C % ScalarPerVector, dilation + this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {2}, {0}, {0}}; + is_supported = this->template Run(); + EXPECT_FALSE(is_supported); + is_supported = this->template Run(); + EXPECT_FALSE(is_supported); + // C = 4 + this->conv_param = {1, 1, 1, 1, 4, {3}, {3}, {1}, {1}, {3}, {3}}; + is_supported = this->template Run(); + EXPECT_TRUE(is_supported); + is_supported = this->template Run(); + EXPECT_TRUE(is_supported); +} + +TEST_F(TestConvTensorRearrangeInterface4ScalarPerVectorFakeC, X4ScalarPerVectorFakeC) +{ + // C = 3 + this->conv_param = {1, 1, 1, 1, 3, {4}, {3}, {1}, {1}, {0}, {0}}; + bool is_supported = this->template Run(); + EXPECT_FALSE(is_supported); + is_supported = this->template Run(); + EXPECT_FALSE(is_supported); + // C = 4 + this->conv_param = {1, 1, 1, 1, 8, {4}, {3}, {1}, {1}, {0}, {0}}; + is_supported = this->template Run(); + EXPECT_TRUE(is_supported); + is_supported = this->template Run(); + EXPECT_TRUE(is_supported); +} diff --git a/test/image_to_column/CMakeLists.txt b/test/image_to_column/CMakeLists.txt deleted file mode 100644 index 0feb827b55..0000000000 --- a/test/image_to_column/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -add_gtest_executable(test_image_to_column test_image_to_column.cpp) -target_link_libraries(test_image_to_column PRIVATE utility device_image_to_column_instance) -add_gtest_executable(test_image_to_column_interface test_image_to_column_interface.cpp) -target_link_libraries(test_image_to_column_interface PRIVATE utility) diff --git a/test/image_to_column/test_image_to_column.cpp b/test/image_to_column/test_image_to_column.cpp deleted file mode 100644 index 0b17cac2d0..0000000000 --- a/test/image_to_column/test_image_to_column.cpp +++ /dev/null @@ -1,121 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#include -#include -#include -#include -#include - -#include - -#include "profiler/profile_image_to_column_impl.hpp" - -template -class TestImageToColumn : public ::testing::Test -{ - protected: - using InDataType = std::tuple_element_t<0, Tuple>; - using OutDataType = std::tuple_element_t<1, Tuple>; - using InLayout = std::tuple_element_t<2, Tuple>; - - std::vector conv_params; - - template - void Run() - { - EXPECT_FALSE(conv_params.empty()); - bool pass = true; - for(auto& param : conv_params) - { - pass = pass && ck::profiler::profile_image_to_column_impl( - true, // do_verification - 1, // init_method: integer value - false, // do_log - false, // time_kernel - param); - } - EXPECT_TRUE(pass); - } -}; - -using namespace ck::tensor_layout::convolution; - -using KernelTypes1d = ::testing::Types, - std::tuple, - std::tuple, - std::tuple>; - -using KernelTypes2d = ::testing::Types, - std::tuple, - std::tuple, - std::tuple>; - -using KernelTypes3d = ::testing::Types, - std::tuple, - std::tuple, - std::tuple>; - -template -class TestImageToColumn1d : public TestImageToColumn -{ -}; - -template -class TestImageToColumn2d : public TestImageToColumn -{ -}; - -template -class TestImageToColumn3d : public TestImageToColumn -{ -}; - -TYPED_TEST_SUITE(TestImageToColumn1d, KernelTypes1d); -TYPED_TEST_SUITE(TestImageToColumn2d, KernelTypes2d); -TYPED_TEST_SUITE(TestImageToColumn3d, KernelTypes3d); - -TYPED_TEST(TestImageToColumn1d, Test1D) -{ - this->conv_params.clear(); - - this->conv_params.push_back({1, 1, 4, 1, 192, {3}, {28}, {1}, {1}, {1}, {1}}); - this->conv_params.push_back({1, 1, 64, 1, 64, {3}, {14}, {1}, {1}, {1}, {1}}); - this->conv_params.push_back({1, 1, 64, 1, 64, {1}, {7}, {2}, {1}, {0}, {0}}); - this->conv_params.push_back({1, 1, 64, 1, 64, {1}, {3}, {1}, {1}, {0}, {0}}); - // ScalarPerVector should be 1 - this->conv_params.push_back({1, 1, 4, 1, 1, {3}, {28}, {1}, {1}, {1}, {1}}); - // stride != 1 - this->conv_params.push_back({1, 1, 1, 1, 4, {3}, {28}, {2}, {1}, {1}, {1}}); - // dilation != 1 - this->conv_params.push_back({1, 1, 1, 1, 4, {3}, {28}, {1}, {2}, {1}, {1}}); - this->template Run<1>(); -} - -TYPED_TEST(TestImageToColumn2d, Test2D) -{ - this->conv_params.clear(); - - this->conv_params.push_back( - {2, 1, 4, 1, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); - this->conv_params.push_back( - {2, 1, 64, 1, 64, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); - this->conv_params.push_back({2, 1, 64, 1, 64, {1, 1}, {7, 7}, {2, 2}, {1, 1}, {0, 0}, {0, 0}}); - this->conv_params.push_back({2, 1, 64, 1, 64, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}); - this->template Run<2>(); -} - -TYPED_TEST(TestImageToColumn3d, Test3D) -{ - this->conv_params.clear(); - this->conv_params.push_back( - {3, 1, 16, 1, 64, {1, 1, 1}, {7, 7, 7}, {2, 2, 2}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); - this->conv_params.push_back( - {3, 1, 2, 1, 64, {3, 3, 3}, {14, 14, 3}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}); - this->conv_params.push_back( - {3, 1, 32, 1, 64, {1, 1, 1}, {3, 3, 3}, {1, 1, 1}, {1, 1, 1}, {0, 0, 0}, {0, 0, 0}}); - this->template Run<3>(); -} diff --git a/test/image_to_column/test_image_to_column_interface.cpp b/test/image_to_column/test_image_to_column_interface.cpp deleted file mode 100644 index ea8b9632e1..0000000000 --- a/test/image_to_column/test_image_to_column_interface.cpp +++ /dev/null @@ -1,196 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#include -#include -#include -#include -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_image_to_column_impl.hpp" - -#include "ck/library/utility/convolution_parameter.hpp" -#include "ck/library/utility/algorithm.hpp" -#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" - -#include - -using DataType = float; -using InLayout = ck::tensor_layout::convolution::GNWC; - -template -using S = ck::Sequence; - -template -class TestImageToColumnInterface : public ::testing::Test -{ - protected: - static constexpr ck::index_t NDimSpatial = 1; - - // clang-format off - using DeviceImgToColInstance = ck::tensor_operation::device::DeviceImageToColumnImpl - //#####################| Num| InLayout| InDataType| OutDataType| Block| MPer| KPer| Thread| Scalar| - //#####################| Dim| | | | Size| Block| Block| Cluster| Per| - //#####################| Spatial| | | | | | | Lengths| Vector| - //#####################| | | | | | | | | | - < NDimSpatial, InLayout, DataType, DataType, 256, 128, 128, S<16, 16>,ScalarPerVector>; - // clang-format on - - ck::utils::conv::ConvParam conv_param; - - bool Run() - { - - const auto N = conv_param.N_; - const auto C = conv_param.C_; - const auto FakeC = - conv_param.C_ / 2; // Fake C to simulate the behavior that C is not packed - - const ck::index_t NDoHoWo = - N * - ck::accumulate_n( - conv_param.output_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>()); - const ck::index_t CZYX = - C * - ck::accumulate_n( - conv_param.filter_spatial_lengths_.begin(), NDimSpatial, 1, std::multiplies<>()); - - const auto in_desc = - ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed( - conv_param); - const auto out_desc = HostTensorDescriptor({NDoHoWo, CZYX}); - - std::array input_spatial_lengths{}; - std::array filter_spatial_lengths{}; - std::array output_spatial_lengths{}; - std::array input_g_n_c_wis_strides{}; - std::array output_m_k_strides{}; - std::array conv_filter_strides{}; - std::array conv_filter_dilations{}; - std::array input_left_pads{}; - std::array input_right_pads{}; - - auto copy = [](const auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); }; - - copy(conv_param.input_spatial_lengths_, input_spatial_lengths); - copy(conv_param.filter_spatial_lengths_, filter_spatial_lengths); - copy(conv_param.output_spatial_lengths_, output_spatial_lengths); - copy(in_desc.GetStrides(), input_g_n_c_wis_strides); - copy(out_desc.GetStrides(), output_m_k_strides); - copy(conv_param.conv_filter_strides_, conv_filter_strides); - copy(conv_param.conv_filter_dilations_, conv_filter_dilations); - copy(conv_param.input_left_pads_, input_left_pads); - copy(conv_param.input_right_pads_, input_right_pads); - - auto img2col = DeviceImgToColInstance{}; - auto argument = img2col.MakeArgument(nullptr, - nullptr, - N, - IsCPacked ? C : FakeC, - input_spatial_lengths, - filter_spatial_lengths, - output_spatial_lengths, - input_g_n_c_wis_strides, - output_m_k_strides, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads); - - return img2col.IsSupportedArgument(argument); - } -}; - -class TestImageToColumnInterface1ScalarPerVector : public TestImageToColumnInterface<1, true> -{ -}; - -class TestImageToColumnInterface4ScalarPerVector : public TestImageToColumnInterface<4, true> -{ -}; - -class TestImageToColumnInterface4ScalarPerVectorFakeC : public TestImageToColumnInterface<4, false> -{ -}; - -TEST_F(TestImageToColumnInterface1ScalarPerVector, X1ScalarPerVector) -{ - // vector load C * X % ScalarPerVector - this->conv_param = {1, 1, 1, 1, 1, {3}, {3}, {1}, {1}, {0}, {0}}; - bool is_supported = this->Run(); - EXPECT_TRUE(is_supported); - // vector load C * left_pad_x % ScalarPerVector - this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {3}, {0}}; - is_supported = this->Run(); - EXPECT_TRUE(is_supported); - // vector load C * right_pad_x % ScalarPerVector - this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {0}, {3}}; - is_supported = this->Run(); - EXPECT_TRUE(is_supported); - - // vector load C % ScalarPerVector, right_pad and stride - this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {0}, {3}}; - is_supported = this->Run(); - EXPECT_TRUE(is_supported); - // vector load C % ScalarPerVector, left_pad and stride - this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {3}, {0}}; - is_supported = this->Run(); - EXPECT_TRUE(is_supported); - // vector load C % ScalarPerVector, dilation - this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {2}, {0}, {0}}; - is_supported = this->Run(); - EXPECT_TRUE(is_supported); - - // C = 4 - this->conv_param = {1, 1, 1, 1, 4, {3}, {3}, {1}, {1}, {3}, {3}}; - is_supported = this->Run(); - EXPECT_TRUE(is_supported); -} - -TEST_F(TestImageToColumnInterface4ScalarPerVector, X4ScalarPerVector) -{ - // vector load C * X % ScalarPerVector - this->conv_param = {1, 1, 1, 1, 1, {3}, {3}, {1}, {1}, {0}, {0}}; - bool is_supported = this->Run(); - EXPECT_FALSE(is_supported); - // vector load C * left_pad_x % ScalarPerVector - this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {3}, {0}}; - is_supported = this->Run(); - EXPECT_FALSE(is_supported); - // vector load C * right_pad_x % ScalarPerVector - this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {1}, {0}, {3}}; - is_supported = this->Run(); - EXPECT_FALSE(is_supported); - - // vector load C % ScalarPerVector, right_pad and stride - this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {0}, {3}}; - is_supported = this->Run(); - EXPECT_FALSE(is_supported); - // vector load C % ScalarPerVector, left_pad and stride - this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {2}, {1}, {3}, {0}}; - is_supported = this->Run(); - EXPECT_FALSE(is_supported); - // vector load C % ScalarPerVector, dilation - this->conv_param = {1, 1, 1, 1, 1, {4}, {3}, {1}, {2}, {0}, {0}}; - is_supported = this->Run(); - EXPECT_FALSE(is_supported); - - // C = 4 - this->conv_param = {1, 1, 1, 1, 4, {3}, {3}, {1}, {1}, {3}, {3}}; - is_supported = this->Run(); - EXPECT_TRUE(is_supported); -} - -TEST_F(TestImageToColumnInterface4ScalarPerVectorFakeC, X4ScalarPerVectorFakeC) -{ - // C = 3 - this->conv_param = {1, 1, 1, 1, 3, {4}, {3}, {1}, {1}, {0}, {0}}; - bool is_supported = this->Run(); - EXPECT_FALSE(is_supported); - // C = 4 - this->conv_param = {1, 1, 1, 1, 8, {4}, {3}, {1}, {1}, {0}, {0}}; - is_supported = this->Run(); - EXPECT_TRUE(is_supported); -}