diff --git a/example/ck_tile/04_img2col/CMakeLists.txt b/example/ck_tile/04_img2col/CMakeLists.txt new file mode 100644 index 0000000000..3864c9ed9d --- /dev/null +++ b/example/ck_tile/04_img2col/CMakeLists.txt @@ -0,0 +1,3 @@ +# not using add_example_executable() to add this target, since we don't want this to have +# to be included in "make all/install/check" +add_executable(tile_example_img2col EXCLUDE_FROM_ALL image_to_column.cpp) diff --git a/example/ck_tile/04_img2col/README.md b/example/ck_tile/04_img2col/README.md new file mode 100644 index 0000000000..6ae2cea5e5 --- /dev/null +++ b/example/ck_tile/04_img2col/README.md @@ -0,0 +1,12 @@ +# Image to Column + +This folder contains example for Image to Column using ck_tile tile-programming implementation. + +## build +``` +# in the root of ck_tile +mkdir build && cd build +sh ../script/cmake-ck-dev.sh ../ # you can replace this to gfx90a, gfx942... +make tile_example_img2col -j +``` +This will result in an executable `build/bin/tile_example_img2col` diff --git a/example/ck_tile/04_img2col/image_to_column.cpp b/example/ck_tile/04_img2col/image_to_column.cpp new file mode 100644 index 0000000000..6380cd2994 --- /dev/null +++ b/example/ck_tile/04_img2col/image_to_column.cpp @@ -0,0 +1,170 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include + +#include "ck_tile/host.hpp" +#include "image_to_column.hpp" + +// Host API implementation +template <> +float image_to_column(const image_to_column_traits& traits, + const image_to_column_args<2>& args, + const ck_tile::stream_config& stream_conf) +{ + if(traits.data_type.compare("fp16") == 0) + { + constexpr ck_tile::index_t NDimSpatial = 2; + constexpr ck_tile::index_t VectorSize = 8; + + using thread_tile = ck_tile::sequence<8, 8>; + using warp_tile = ck_tile::sequence<64, 64>; + using block_tile = ck_tile::sequence<128, 128>; + + using Shape = ck_tile::TileImageToColumnShape; + + using InDataType = ck_tile::half_t; + using OutDataType = ck_tile::half_t; + + using PipelineProblem = ck_tile::BlockImageToColumnProblem; + + using Kernel = ck_tile::ImageToColumn; + + auto kargs = Kernel::MakeKargs(args.p_in, + args.p_out, + args.G, + args.N, + args.C, + args.input_spatial_lengths, + args.filter_spatial_lengths, + args.output_spatial_lengths, + args.image_g_n_c_wis_strides, + args.gemm_g_m_k_strides, + args.conv_filter_strides, + args.conv_filter_dilations, + args.input_left_pads, + args.input_right_pads); + + const dim3 grids = Kernel::GridSize( + args.N * args.output_spatial_lengths[0] * args.output_spatial_lengths[1], + args.filter_spatial_lengths[0] * args.filter_spatial_lengths[1] * args.C, + args.G); + constexpr dim3 blocks = Kernel::BlockSize(); + + constexpr ck_tile::index_t kBlockPerCu = 2; + + float ave_time = ck_tile::launch_kernel( + stream_conf, + ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + + return ave_time; + } + + return 0; +} + +int main(int argc, char* argv[]) +{ + constexpr ck_tile::index_t NDimSpatial = 2; + + ExecutionConfig config; + ck_tile::conv::ConvParam conv_params = DefaultConvParams; + + if(!parse_cmd_args(argc, argv, config, conv_params)) + { + return EXIT_FAILURE; + } + + if(conv_params.num_dim_spatial_ != NDimSpatial) + { + std::cerr << "unsupported # of spatial dimensions" << std::endl; + return EXIT_FAILURE; + } + + using InDataType = ck_tile::half_t; + using OutDataType = ck_tile::half_t; + using ImLayout = ck_tile::tensor_layout::convolution::NHWGC; + + const auto G = conv_params.G_; + const auto N = conv_params.N_; + const auto C = conv_params.C_; + + const ck_tile::long_index_t NHoWo = + N * std::accumulate(conv_params.output_spatial_lengths_.begin(), + std::next(conv_params.output_spatial_lengths_.begin(), NDimSpatial), + 1, + std::multiplies<>()); + + const ck_tile::long_index_t CYX = + C * std::accumulate(conv_params.filter_spatial_lengths_.begin(), + std::next(conv_params.filter_spatial_lengths_.begin(), NDimSpatial), + 1, + std::multiplies<>()); + + const auto in_desc = + ck_tile::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed(conv_params); + const auto out_desc = ck_tile::HostTensorDescriptor({G, NHoWo, CYX}); + + // host verify + ck_tile::HostTensor in(in_desc); + ck_tile::HostTensor out_device(out_desc); + ck_tile::HostTensor out_host(out_desc); + + switch(config.init_method) + { + case 0: break; + case 1: ck_tile::FillUniformDistributionIntegerValue{-5.f, 5.f}(in); break; + default: ck_tile::FillUniformDistribution{-0.5, 0.5}(in); break; + } + + ck_tile::DeviceMem in_device_buf(in.get_element_space_size_in_bytes()); + ck_tile::DeviceMem out_device_buf(out_device.get_element_space_size_in_bytes()); + + in_device_buf.ToDevice(in.data()); + + image_to_column_traits traits{"fp16"}; + + image_to_column_args args{ + in_device_buf.GetDeviceBuffer(), + out_device_buf.GetDeviceBuffer(), + G, + N, + C, + ck_tile::to_array(conv_params.input_spatial_lengths_), + ck_tile::to_array(conv_params.filter_spatial_lengths_), + ck_tile::to_array(conv_params.output_spatial_lengths_), + ck_tile::to_array(in_desc.get_strides()), + ck_tile::to_array(out_desc.get_strides()), + ck_tile::to_array(conv_params.conv_filter_strides_), + ck_tile::to_array(conv_params.conv_filter_dilations_), + ck_tile::to_array(conv_params.input_left_pads_), + ck_tile::to_array(conv_params.input_right_pads_)}; + + float ave_time = + image_to_column(traits, args, ck_tile::stream_config{nullptr, config.time_kernel}); + + std::size_t num_btype = G * NHoWo * CYX * (sizeof(OutDataType) + sizeof(InDataType)); + float gb_per_sec = num_btype / 1.E6 / ave_time; + std::cout << "Perf: " << ave_time << " ms, " << gb_per_sec << " GB/s" << std::endl; + + bool pass = true; + + if(config.do_verification) + { + // reference + ck_tile::reference_im2col(in, out_host, conv_params); + + out_device_buf.FromDevice(out_device.data()); + pass = ck_tile::check_err(out_device, out_host); + + std::cout << "valid:" << (pass ? "y" : "n") << std::endl; + } + + return !pass; +} diff --git a/example/ck_tile/04_img2col/image_to_column.hpp b/example/ck_tile/04_img2col/image_to_column.hpp new file mode 100644 index 0000000000..90484e08ec --- /dev/null +++ b/example/ck_tile/04_img2col/image_to_column.hpp @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/image_to_column.hpp" +#include + +#define DefaultConvParams \ + ck_tile::conv::ConvParam \ + { \ + 2, 2, 32, 32, 32, {4, 4}, {64, 64}, {1, 1}, {1, 1}, {0, 0}, { 0, 0 } \ + } + +struct ExecutionConfig final +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; +}; + +inline void print_help_msg() +{ + std::cerr << "arg1: verification (0=no, 1=yes)\n" + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << "arg3: time kernel (0=no, 1=yes)\n" + << ck_tile::conv::get_conv_param_parser_helper_msg() << std::endl; +} + +inline bool parse_cmd_args(int argc, + char* argv[], + ExecutionConfig& config, + ck_tile::conv::ConvParam& conv_params) +{ + constexpr int num_execution_config_args = + 3; // arguments for do_verification, init_method, time_kernel + constexpr int num_conv_param_leading_args = 5; // arguments for num_dim_spatial_, G_, N_, K_, C_ + + constexpr int threshold_to_catch_partial_args = 1 + num_execution_config_args; + constexpr int threshold_to_catch_all_args = + threshold_to_catch_partial_args + num_conv_param_leading_args; + + if(argc == 1) + { + // use default + config = ExecutionConfig{}; + } + // catch only ExecutionConfig arguments + else if(argc == threshold_to_catch_partial_args) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + } + // catch both ExecutionConfig & ConvParam arguments + else if(threshold_to_catch_all_args < argc && ((argc - threshold_to_catch_all_args) % 3 == 0)) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + + const ck_tile::index_t num_dim_spatial = std::stoi(argv[4]); + conv_params = + ck_tile::conv::parse_conv_param(num_dim_spatial, threshold_to_catch_partial_args, argv); + } + else + { + print_help_msg(); + return false; + } + + return true; +} + +struct image_to_column_traits +{ + std::string data_type; +}; + +template +struct image_to_column_args +{ + const void* p_in; + void* p_out; + const ck_tile::long_index_t G; + const ck_tile::long_index_t N; + const ck_tile::long_index_t C; + const ck_tile::array input_spatial_lengths; + const ck_tile::array filter_spatial_lengths; + const ck_tile::array output_spatial_lengths; + const ck_tile::array image_g_n_c_wis_strides; + const ck_tile::array gemm_g_m_k_strides; + const ck_tile::array conv_filter_strides; + const ck_tile::array conv_filter_dilations; + const ck_tile::array input_left_pads; + const ck_tile::array input_right_pads; +}; + +// host API +template +float image_to_column(const image_to_column_traits&, + const image_to_column_args&, + const ck_tile::stream_config&); diff --git a/example/ck_tile/CMakeLists.txt b/example/ck_tile/CMakeLists.txt index 3b4d1ca8be..fe1e9c9edf 100644 --- a/example/ck_tile/CMakeLists.txt +++ b/example/ck_tile/CMakeLists.txt @@ -5,3 +5,4 @@ include_directories(AFTER add_subdirectory(01_fmha) add_subdirectory(02_layernorm2d) add_subdirectory(03_gemm) +add_subdirectory(04_img2col) diff --git a/include/ck_tile/core/container/array.hpp b/include/ck_tile/core/container/array.hpp index c272b01f54..b587793bc9 100644 --- a/include/ck_tile/core/container/array.hpp +++ b/include/ck_tile/core/container/array.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -236,6 +236,16 @@ CK_TILE_HOST_DEVICE constexpr bool operator!=(const array& a, const arr return !(a == b); } +template +CK_TILE_HOST_DEVICE constexpr auto to_array(const std::vector& x) +{ + array arr; + + static_for<0, N, 1>{}([&x, &arr](auto i) { arr(i) = x[i]; }); + + return arr; +} + template CK_TILE_HOST_DEVICE constexpr auto to_array(const X& x) { diff --git a/include/ck_tile/host.hpp b/include/ck_tile/host.hpp index deebe90bf7..b382710b19 100644 --- a/include/ck_tile/host.hpp +++ b/include/ck_tile/host.hpp @@ -5,6 +5,8 @@ #include "ck_tile/host/arg_parser.hpp" #include "ck_tile/host/check_err.hpp" +#include "ck_tile/host/convolution_host_tensor_descriptor_helper.hpp" +#include "ck_tile/host/convolution_parameter.hpp" #include "ck_tile/host/device_memory.hpp" #include "ck_tile/host/fill.hpp" #include "ck_tile/host/hip_check_error.hpp" diff --git a/include/ck_tile/host/convolution_host_tensor_descriptor_helper.hpp b/include/ck_tile/host/convolution_host_tensor_descriptor_helper.hpp new file mode 100644 index 0000000000..b7317fc04b --- /dev/null +++ b/include/ck_tile/host/convolution_host_tensor_descriptor_helper.hpp @@ -0,0 +1,266 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/ops/common/tensor_layout.hpp" +#include "ck_tile/host/convolution_parameter.hpp" +#include "ck_tile/host/host_tensor.hpp" + +namespace ck_tile { +namespace conv { +namespace detail { + +template +CK_TILE_HOST std::vector get_layout_transpose_gnchw_to_old() +{ + if constexpr(std::is_same_v || + std::is_same_v || + std::is_same_v) + { + return {0, 1, 2, 3}; + } + else if constexpr(std::is_same_v || + std::is_same_v || + std::is_same_v) + { + return {0, 1, 2, 3, 4}; + } + else if constexpr(std::is_same_v || + std::is_same_v || + std::is_same_v) + { + return {0, 1, 2, 3, 4, 5}; + } + if constexpr(std::is_same_v || + std::is_same_v || + std::is_same_v) + { + return {0, 1, 3, 2}; + } + else if constexpr(std::is_same_v || + std::is_same_v || + std::is_same_v) + { + return {0, 1, 4, 2, 3}; + } + else if constexpr(std::is_same_v || + std::is_same_v || + std::is_same_v) + { + return {0, 1, 5, 2, 3, 4}; + } + else if constexpr(std::is_same_v || + std::is_same_v || + std::is_same_v) + { + return {2, 0, 3, 1}; + } + else if constexpr(std::is_same_v || + std::is_same_v || + std::is_same_v) + { + return {3, 0, 4, 1, 2}; + } + else if constexpr(std::is_same_v || + std::is_same_v || + std::is_same_v) + { + return {4, 0, 5, 1, 2, 3}; + } + else + { + printf("%s\n", __func__); + throw std::runtime_error("wrong! unsupported layout"); + } +} + +} // namespace detail + +// make tensor descriptor for packed input tensor, and order the dimension in the order of GNCHW +// regardless of physical layout +template +CK_TILE_HOST HostTensorDescriptor +make_input_host_tensor_descriptor_g_n_c_wis_packed(const ck_tile::conv::ConvParam& param) +{ + std::vector physical_lengths; + + if constexpr(std::is_same_v || + std::is_same_v || + std::is_same_v) + { + physical_lengths = std::vector{static_cast(param.G_), + static_cast(param.N_), + static_cast(param.C_)}; + + physical_lengths.insert(physical_lengths.end(), + param.input_spatial_lengths_.begin(), + param.input_spatial_lengths_.begin() + param.num_dim_spatial_); + } + else if constexpr(std::is_same_v || + std::is_same_v || + std::is_same_v) + { + physical_lengths = std::vector{static_cast(param.G_), + static_cast(param.N_), + static_cast(param.C_)}; + + physical_lengths.insert(physical_lengths.begin() + 2, + param.input_spatial_lengths_.begin(), + param.input_spatial_lengths_.begin() + param.num_dim_spatial_); + } + else if constexpr(std::is_same_v || + std::is_same_v || + std::is_same_v) + { + physical_lengths = std::vector{static_cast(param.N_), + static_cast(param.G_), + static_cast(param.C_)}; + + physical_lengths.insert(physical_lengths.begin() + 1, + param.input_spatial_lengths_.begin(), + param.input_spatial_lengths_.begin() + param.num_dim_spatial_); + } + else + { + printf("%s\n", __func__); + printf("%s\n", InLayout::name); + throw std::runtime_error("wrong! unsupported layout"); + } + + return transpose_host_tensor_descriptor_given_new2old( + HostTensorDescriptor(physical_lengths), + detail::get_layout_transpose_gnchw_to_old()); +} + +// make tensor descriptor for packed weight tensor, and order the dimension in the order of GKCYX +// regardless of physical layout +template +CK_TILE_HOST HostTensorDescriptor +make_weight_host_tensor_descriptor_g_k_c_xs_packed(const ck_tile::conv::ConvParam& param) +{ + std::vector physical_lengths; + + if constexpr(std::is_same_v || + std::is_same_v || + std::is_same_v) + { + if(param.G_ != 1) + { + throw std::runtime_error("wrong! G != 1"); + } + + physical_lengths = std::vector{static_cast(param.K_), + static_cast(param.C_)}; + + physical_lengths.insert(physical_lengths.end(), + param.filter_spatial_lengths_.begin(), + param.filter_spatial_lengths_.begin() + param.num_dim_spatial_); + } + else if constexpr(std::is_same_v || + std::is_same_v || + std::is_same_v) + { + physical_lengths = std::vector{static_cast(param.G_), + static_cast(param.K_), + static_cast(param.C_)}; + + physical_lengths.insert(physical_lengths.end(), + param.filter_spatial_lengths_.begin(), + param.filter_spatial_lengths_.begin() + param.num_dim_spatial_); + } + else if constexpr(std::is_same_v || + std::is_same_v || + std::is_same_v) + { + physical_lengths = std::vector{static_cast(param.G_), + static_cast(param.K_), + static_cast(param.C_)}; + + physical_lengths.insert(physical_lengths.begin() + 2, + param.filter_spatial_lengths_.begin(), + param.filter_spatial_lengths_.begin() + param.num_dim_spatial_); + } + else if constexpr(std::is_same_v || + std::is_same_v || + std::is_same_v) + { + physical_lengths = std::vector{static_cast(param.K_), + static_cast(param.G_), + static_cast(param.C_)}; + + physical_lengths.insert(physical_lengths.begin() + 1, + param.filter_spatial_lengths_.begin(), + param.filter_spatial_lengths_.begin() + param.num_dim_spatial_); + } + else + { + printf("%s\n", __func__); + printf("%s\n", WeiLayout::name); + throw std::runtime_error("wrong! unsupported layout"); + } + + return transpose_host_tensor_descriptor_given_new2old( + HostTensorDescriptor(physical_lengths), + detail::get_layout_transpose_gnchw_to_old()); +} + +// make tensor descriptor for packed output tensor, and order the dimension in the order of GNKHW +// regardless of physical layout +template +CK_TILE_HOST HostTensorDescriptor +make_output_host_tensor_descriptor_g_n_k_wos_packed(const ck_tile::conv::ConvParam& param) +{ + std::vector physical_lengths; + + if constexpr(std::is_same_v || + std::is_same_v || + std::is_same_v) + { + physical_lengths = std::vector{static_cast(param.G_), + static_cast(param.N_), + static_cast(param.K_)}; + + physical_lengths.insert(physical_lengths.end(), + param.output_spatial_lengths_.begin(), + param.output_spatial_lengths_.begin() + param.num_dim_spatial_); + } + // separate from legacy code above + else if constexpr(std::is_same_v || + std::is_same_v || + std::is_same_v) + { + physical_lengths = std::vector{static_cast(param.G_), + static_cast(param.N_), + static_cast(param.K_)}; + + physical_lengths.insert(physical_lengths.begin() + 2, + param.output_spatial_lengths_.begin(), + param.output_spatial_lengths_.begin() + param.num_dim_spatial_); + } + else if constexpr(std::is_same_v || + std::is_same_v || + std::is_same_v) + { + physical_lengths = std::vector{static_cast(param.N_), + static_cast(param.G_), + static_cast(param.K_)}; + + physical_lengths.insert(physical_lengths.begin() + 1, + param.output_spatial_lengths_.begin(), + param.output_spatial_lengths_.begin() + param.num_dim_spatial_); + } + else + { + printf("%s\n", __func__); + printf("%s\n", OutLayout::name); + throw std::runtime_error("wrong! unsupported layout"); + } + + return transpose_host_tensor_descriptor_given_new2old( + HostTensorDescriptor(physical_lengths), + detail::get_layout_transpose_gnchw_to_old()); +} + +} // namespace conv +} // namespace ck_tile diff --git a/include/ck_tile/host/convolution_parameter.hpp b/include/ck_tile/host/convolution_parameter.hpp new file mode 100644 index 0000000000..741a25ad73 --- /dev/null +++ b/include/ck_tile/host/convolution_parameter.hpp @@ -0,0 +1,283 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include + +namespace ck_tile { +namespace conv { + +struct ConvParam +{ + ConvParam(); + ConvParam(ck_tile::index_t n_dim, + ck_tile::index_t group_count, + ck_tile::index_t n_batch, + ck_tile::index_t n_out_channels, + ck_tile::index_t n_in_channels, + const std::vector& filters_len, + const std::vector& input_len, + const std::vector& strides, + const std::vector& dilations, + const std::vector& left_pads, + const std::vector& right_pads) + : num_dim_spatial_(static_cast(n_dim)), + G_(static_cast(group_count)), + N_(static_cast(n_batch)), + K_(static_cast(n_out_channels)), + C_(static_cast(n_in_channels)), + filter_spatial_lengths_(num_dim_spatial_), + input_spatial_lengths_(num_dim_spatial_), + output_spatial_lengths_(num_dim_spatial_), + conv_filter_strides_(num_dim_spatial_), + conv_filter_dilations_(num_dim_spatial_), + input_left_pads_(num_dim_spatial_), + input_right_pads_(num_dim_spatial_) + { + if(static_cast(filter_spatial_lengths_.size()) != num_dim_spatial_ || + static_cast(input_spatial_lengths_.size()) != num_dim_spatial_ || + static_cast(conv_filter_strides_.size()) != num_dim_spatial_ || + static_cast(conv_filter_dilations_.size()) != num_dim_spatial_ || + static_cast(input_left_pads_.size()) != num_dim_spatial_ || + static_cast(input_right_pads_.size()) != num_dim_spatial_) + { + throw(std::runtime_error( + "ConvParam::ConvParam: " + "parameter size is different from number of declared dimensions!")); + } + + for(ck_tile::index_t i = 0; i < num_dim_spatial_; ++i) + { + filter_spatial_lengths_[i] = static_cast(filters_len[i]); + input_spatial_lengths_[i] = static_cast(input_len[i]); + conv_filter_strides_[i] = static_cast(strides[i]); + conv_filter_dilations_[i] = static_cast(dilations[i]); + input_left_pads_[i] = static_cast(left_pads[i]); + input_right_pads_[i] = static_cast(right_pads[i]); + + // XEff = (X - 1) * conv_dilation_w + 1; + // Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; + const ck_tile::long_index_t x_eff = + (filter_spatial_lengths_[i] - 1) * conv_filter_dilations_[i] + 1; + + output_spatial_lengths_[i] = + (input_spatial_lengths_[i] + input_left_pads_[i] + input_right_pads_[i] - x_eff) / + conv_filter_strides_[i] + + 1; + } + } + + ConvParam(ck_tile::long_index_t n_dim, + ck_tile::long_index_t group_count, + ck_tile::long_index_t n_batch, + ck_tile::long_index_t n_out_channels, + ck_tile::long_index_t n_in_channels, + const std::vector& filters_len, + const std::vector& input_len, + const std::vector& strides, + const std::vector& dilations, + const std::vector& left_pads, + const std::vector& right_pads) + : num_dim_spatial_(n_dim), + G_(group_count), + N_(n_batch), + K_(n_out_channels), + C_(n_in_channels), + filter_spatial_lengths_(filters_len), + input_spatial_lengths_(input_len), + output_spatial_lengths_(num_dim_spatial_), + conv_filter_strides_(strides), + conv_filter_dilations_(dilations), + input_left_pads_(left_pads), + input_right_pads_(right_pads) + { + if(static_cast(filter_spatial_lengths_.size()) != num_dim_spatial_ || + static_cast(input_spatial_lengths_.size()) != num_dim_spatial_ || + static_cast(conv_filter_strides_.size()) != num_dim_spatial_ || + static_cast(conv_filter_dilations_.size()) != num_dim_spatial_ || + static_cast(input_left_pads_.size()) != num_dim_spatial_ || + static_cast(input_right_pads_.size()) != num_dim_spatial_) + { + throw(std::runtime_error( + "ConvParam::ConvParam: " + "parameter size is different from number of declared dimensions!")); + } + + for(ck_tile::index_t i = 0; i < num_dim_spatial_; ++i) + { + // XEff = (X - 1) * conv_dilation_w + 1; + // Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1; + const ck_tile::long_index_t x_eff = + (filter_spatial_lengths_[i] - 1) * conv_filter_dilations_[i] + 1; + + output_spatial_lengths_[i] = + (input_spatial_lengths_[i] + input_left_pads_[i] + input_right_pads_[i] - x_eff) / + conv_filter_strides_[i] + + 1; + } + } + + ck_tile::long_index_t num_dim_spatial_; + ck_tile::long_index_t G_; + ck_tile::long_index_t N_; + ck_tile::long_index_t K_; + ck_tile::long_index_t C_; + + std::vector filter_spatial_lengths_; + std::vector input_spatial_lengths_; + std::vector output_spatial_lengths_; + + std::vector conv_filter_strides_; + std::vector conv_filter_dilations_; + + std::vector input_left_pads_; + std::vector input_right_pads_; + + std::vector GetOutputSpatialLengths() const + { + return output_spatial_lengths_; + } + + std::size_t GetFlops() const + { + // 2 * G * N * K * C * * + return static_cast(2) * G_ * N_ * K_ * C_ * + std::accumulate(std::begin(output_spatial_lengths_), + std::next(std::begin(output_spatial_lengths_), num_dim_spatial_), + 1, + std::multiplies<>()) * + std::accumulate(std::begin(filter_spatial_lengths_), + std::next(std::begin(filter_spatial_lengths_), num_dim_spatial_), + 1, + std::multiplies<>()); + } + + template + std::size_t GetInputByte() const + { + // sizeof(InDataType) * (G * N * C * ) + + return sizeof(InDataType) * + (G_ * N_ * C_ * + std::accumulate(std::begin(input_spatial_lengths_), + std::next(std::begin(input_spatial_lengths_), num_dim_spatial_), + 1, + std::multiplies<>())); + } + + template + std::size_t GetWeightByte() const + { + // sizeof(WeiDataType) * (G * K * C * ) + + return sizeof(WeiDataType) * + (G_ * K_ * C_ * + std::accumulate(std::begin(filter_spatial_lengths_), + std::next(std::begin(filter_spatial_lengths_), num_dim_spatial_), + 1, + std::multiplies<>())); + } + + template + std::size_t GetOutputByte() const + { + // sizeof(OutDataType) * (G * N * K * ); + return sizeof(OutDataType) * (G_ * N_ * K_ * + std::accumulate(std::begin(output_spatial_lengths_), + std::end(output_spatial_lengths_), + static_cast(1), + std::multiplies())); + } + + template + std::size_t GetByte() const + { + return GetInputByte() + GetWeightByte() + + GetOutputByte(); + } +}; + +ConvParam::ConvParam() + : ConvParam::ConvParam(2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}) +{ +} + +CK_TILE_HOST std::string get_conv_param_parser_helper_msg() +{ + std::string msg; + + msg += "Following arguments (depending on number of spatial dims):\n" + " Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n" + " G, N, K, C, \n" + " , (ie Y, X for 2D)\n" + " , (ie Hi, Wi for 2D)\n" + " , (ie Sy, Sx for 2D)\n" + " , (ie Dy, Dx for 2D)\n" + " , (ie LeftPy, LeftPx for 2D)\n" + " , (ie RightPy, RightPx for 2D)\n"; + + return msg; +} + +CK_TILE_HOST ck_tile::conv::ConvParam +parse_conv_param(int num_dim_spatial, int arg_idx, char* const argv[]) +{ + const ck_tile::long_index_t G = std::stol(argv[arg_idx++]); + const ck_tile::long_index_t N = std::stol(argv[arg_idx++]); + const ck_tile::long_index_t K = std::stol(argv[arg_idx++]); + const ck_tile::long_index_t C = std::stol(argv[arg_idx++]); + + std::vector filter_spatial_lengths(num_dim_spatial); + std::vector input_spatial_lengths(num_dim_spatial); + std::vector conv_filter_strides(num_dim_spatial); + std::vector conv_filter_dilations(num_dim_spatial); + std::vector input_left_pads(num_dim_spatial); + std::vector input_right_pads(num_dim_spatial); + + for(int i = 0; i < num_dim_spatial; ++i) + { + filter_spatial_lengths[i] = std::stol(argv[arg_idx++]); + } + + for(int i = 0; i < num_dim_spatial; ++i) + { + input_spatial_lengths[i] = std::stol(argv[arg_idx++]); + } + + for(int i = 0; i < num_dim_spatial; ++i) + { + conv_filter_strides[i] = std::stol(argv[arg_idx++]); + } + + for(int i = 0; i < num_dim_spatial; ++i) + { + conv_filter_dilations[i] = std::stol(argv[arg_idx++]); + } + + for(int i = 0; i < num_dim_spatial; ++i) + { + input_left_pads[i] = std::stol(argv[arg_idx++]); + } + + for(int i = 0; i < num_dim_spatial; ++i) + { + input_right_pads[i] = std::stol(argv[arg_idx++]); + } + + return ck_tile::conv::ConvParam{num_dim_spatial, + G, + N, + K, + C, + filter_spatial_lengths, + input_spatial_lengths, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads}; +} + +} // namespace conv +} // namespace ck_tile diff --git a/include/ck_tile/host/host_tensor.hpp b/include/ck_tile/host/host_tensor.hpp index 918abc69cc..f533d5c189 100644 --- a/include/ck_tile/host/host_tensor.hpp +++ b/include/ck_tile/host/host_tensor.hpp @@ -176,7 +176,20 @@ struct HostTensorDescriptor return std::inner_product(iss.begin(), iss.end(), mStrides.begin(), std::size_t{0}); } - friend std::ostream& operator<<(std::ostream& os, const HostTensorDescriptor& desc); + friend std::ostream& operator<<(std::ostream& os, const HostTensorDescriptor& desc) + { + os << "dim " << desc.get_num_of_dimension() << ", "; + + os << "lengths {"; + LogRange(os, desc.get_lengths(), ", "); + os << "}, "; + + os << "strides {"; + LogRange(os, desc.get_strides(), ", "); + os << "}"; + + return os; + } private: std::vector mLens; diff --git a/include/ck_tile/host/reference/reference_im2col.hpp b/include/ck_tile/host/reference/reference_im2col.hpp index 410140daa6..392d6abd47 100644 --- a/include/ck_tile/host/reference/reference_im2col.hpp +++ b/include/ck_tile/host/reference/reference_im2col.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -9,53 +9,125 @@ namespace ck_tile { -template -CK_TILE_HOST void reference_im2col(HostTensor& in_mtx_host_ref, - const HostTensor& in_host, - int /*N*/, - int /*K*/, - int C, - int /*Y*/, - int X, - int Hi, - int Wi, - int Ho, - int Wo, - int ConvStrideH, - int ConvStrideW, - int ConvDilationH, - int ConvDilationW, - int InLeftPadH, - int InLeftPadW, - int /*InRightPadH*/, - int /*InRightPadW*/) +template +CK_TILE_HOST void reference_im2col(const HostTensor& in_host, + HostTensor& out_host, + const ck_tile::conv::ConvParam& conv_params) { - int GemmM = in_mtx_host_ref.get_lengths()[0]; - int GemmK = in_mtx_host_ref.get_lengths()[1]; + const long_index_t G = in_host.get_lengths()[0]; + const long_index_t N = in_host.get_lengths()[1]; + const long_index_t C = in_host.get_lengths()[2]; - for(int gemm_m = 0; gemm_m < GemmM; ++gemm_m) + if constexpr(NDimSpatial == 1) { - int mtmp = gemm_m; - int n = mtmp / (Ho * Wo); - mtmp -= n * Ho * Wo; - int ho = mtmp / Wo; - int wo = mtmp - ho * Wo; + const long_index_t Wo = conv_params.output_spatial_lengths_[0]; + auto func = [&](auto g, auto n, auto wo) { + long_index_t row = n * Wo + wo; + long_index_t column = 0; - for(int gemm_k = 0; gemm_k < GemmK; ++gemm_k) - { - int ktmp = gemm_k; - int y = ktmp / (X * C); - ktmp -= y * X * C; - int x = ktmp / C; - int c = ktmp - x * C; + for(long_index_t x = 0; x < conv_params.filter_spatial_lengths_[0]; ++x) + { + auto wi = static_cast(wo * conv_params.conv_filter_strides_[0]) + + static_cast(x * conv_params.conv_filter_dilations_[0]) - + static_cast(conv_params.input_left_pads_[0]); - int hi = y * ConvDilationH + ho * ConvStrideH - InLeftPadH; - int wi = x * ConvDilationW + wo * ConvStrideW - InLeftPadW; + for(long_index_t c = 0; c < C; ++c) + { + if(wi >= 0 && type_convert(wi) < in_host.get_lengths()[3]) + { + InDataType v_in = in_host(g, n, c, wi); + out_host(g, row, column) = type_convert(v_in); + } + column++; + } + } + }; - bool inbound = (hi >= 0 && hi < Hi && wi >= 0 && wi < Wi); + make_ParallelTensorFunctor(func, G, N, Wo)(std::thread::hardware_concurrency()); + } + else if constexpr(NDimSpatial == 2) + { + const long_index_t Ho = conv_params.output_spatial_lengths_[0]; + const long_index_t Wo = conv_params.output_spatial_lengths_[1]; - in_mtx_host_ref(gemm_m, gemm_k) = inbound ? in_host(n, hi, wi, c) : 0; - } + auto func = [&](auto g, auto n, auto ho, auto wo) { + long_index_t row = n * Ho * Wo + ho * Wo + wo; + long_index_t column = 0; + + for(long_index_t y = 0; y < conv_params.filter_spatial_lengths_[0]; ++y) + { + auto hi = static_cast(ho * conv_params.conv_filter_strides_[0]) + + static_cast(y * conv_params.conv_filter_dilations_[0]) - + static_cast(conv_params.input_left_pads_[0]); + + for(long_index_t x = 0; x < conv_params.filter_spatial_lengths_[1]; ++x) + { + auto wi = static_cast(wo * conv_params.conv_filter_strides_[1]) + + static_cast(x * conv_params.conv_filter_dilations_[1]) - + static_cast(conv_params.input_left_pads_[1]); + + for(long_index_t c = 0; c < C; ++c) + { + + if(hi >= 0 && type_convert(hi) < in_host.get_lengths()[3] && + wi >= 0 && type_convert(wi) < in_host.get_lengths()[4]) + { + InDataType v_in = in_host(g, n, c, hi, wi); + out_host(g, row, column) = type_convert(v_in); + } + column++; + } + } + } + }; + + make_ParallelTensorFunctor(func, G, N, Ho, Wo)(std::thread::hardware_concurrency()); + } + else if constexpr(NDimSpatial == 3) + { + const long_index_t Do = conv_params.output_spatial_lengths_[0]; + const long_index_t Ho = conv_params.output_spatial_lengths_[1]; + const long_index_t Wo = conv_params.output_spatial_lengths_[2]; + + auto func = [&](auto g, auto n, auto d_o, auto ho, auto wo) { + long_index_t row = n * Do * Ho * Wo + d_o * Ho * Wo + ho * Wo + wo; + long_index_t column = 0; + + for(long_index_t z = 0; z < conv_params.filter_spatial_lengths_[0]; ++z) + { + auto di = static_cast(d_o * conv_params.conv_filter_strides_[0]) + + static_cast(z * conv_params.conv_filter_dilations_[0]) - + static_cast(conv_params.input_left_pads_[0]); + for(long_index_t y = 0; y < conv_params.filter_spatial_lengths_[1]; ++y) + { + auto hi = static_cast(ho * conv_params.conv_filter_strides_[1]) + + static_cast(y * conv_params.conv_filter_dilations_[1]) - + static_cast(conv_params.input_left_pads_[1]); + for(long_index_t x = 0; x < conv_params.filter_spatial_lengths_[2]; ++x) + { + auto wi = + static_cast(wo * conv_params.conv_filter_strides_[2]) + + static_cast(x * conv_params.conv_filter_dilations_[2]) - + static_cast(conv_params.input_left_pads_[2]); + for(long_index_t c = 0; c < C; ++c) + { + if(di >= 0 && + type_convert(di) < in_host.get_lengths()[3] && + hi >= 0 && + type_convert(hi) < in_host.get_lengths()[4] && + wi >= 0 && type_convert(wi) < in_host.get_lengths()[5]) + { + InDataType v_in = in_host(g, n, c, di, hi, wi); + out_host(g, row, column) = type_convert(v_in); + } + column++; + } + } + } + } + }; + + make_ParallelTensorFunctor(func, G, N, Do, Ho, Wo)(std::thread::hardware_concurrency()); } } } // namespace ck_tile diff --git a/include/ck_tile/ops/image_to_column.hpp b/include/ck_tile/ops/image_to_column.hpp new file mode 100644 index 0000000000..ea92f7ba2a --- /dev/null +++ b/include/ck_tile/ops/image_to_column.hpp @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/ops/image_to_column/kernel/image_to_column_kernel.hpp" +#include "ck_tile/ops/image_to_column/pipeline/block_image_to_column_problem.hpp" +#include "ck_tile/ops/image_to_column/pipeline/tile_image_to_column_shape.hpp" diff --git a/include/ck_tile/ops/image_to_column/kernel/image_to_column_kernel.hpp b/include/ck_tile/ops/image_to_column/kernel/image_to_column_kernel.hpp new file mode 100644 index 0000000000..ee74f1588f --- /dev/null +++ b/include/ck_tile/ops/image_to_column/kernel/image_to_column_kernel.hpp @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/ops/common.hpp" + +namespace ck_tile { + +template +struct ImageToColumn +{ + static constexpr auto I0 = number<0>{}; + static constexpr auto I1 = number<1>{}; + static constexpr auto I2 = number<2>{}; + static constexpr auto I3 = number<3>{}; + static constexpr auto I4 = number<4>{}; + + using Problem = remove_cvref_t; + + using InDataType = remove_cvref_t; + using OutDataType = remove_cvref_t; + + static constexpr index_t NDimSpatial = Problem::NDimSpatial; + + static constexpr index_t AligmentIn = Problem::AligmentIn; + static constexpr index_t AligmentOut = Problem::AligmentOut; + + static_assert(NDimSpatial == 2, "Not supported."); + + static constexpr index_t kMPerBlock = Problem::BlockShape::kMPerBlock; + static constexpr index_t kKPerBlock = Problem::BlockShape::kKPerBlock; + + struct Kargs + { + const void* p_in; + void* p_out; + + const long_index_t G; + const long_index_t N; + const long_index_t C; + + const array input_spatial_lengths; + const array filter_spatial_lengths; + const array output_spatial_lengths; + const array image_g_n_c_wis_strides; + const array gemm_g_m_k_strides; + const array conv_filter_strides; + const array conv_filter_dilations; + const array input_left_pads; + const array input_right_pads; + }; + + CK_TILE_HOST static constexpr Kargs + MakeKargs(const void* p_in, + void* p_out, + const long_index_t G, + const long_index_t N, + const long_index_t C, + const array input_spatial_lengths, + const array filter_spatial_lengths, + const array output_spatial_lengths, + const array image_g_n_c_wis_strides, + const array gemm_g_m_k_strides, + const array conv_filter_strides, + const array conv_filter_dilations, + const array input_left_pads, + const array input_right_pads) + { + return Kargs{p_in, + p_out, + G, + N, + C, + input_spatial_lengths, + filter_spatial_lengths, + output_spatial_lengths, + image_g_n_c_wis_strides, + gemm_g_m_k_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads}; + } + + CK_TILE_HOST static constexpr auto GridSize(index_t GemmM, index_t GemmK, index_t Batch) + { + return dim3( + integer_divide_ceil(GemmM, kMPerBlock), integer_divide_ceil(GemmK, kKPerBlock), Batch); + } + + CK_TILE_HOST static constexpr auto BlockSize() { return Problem::BlockShape::kBlockSize; } + + CK_TILE_DEVICE auto MakeImageMKDesc(const Kargs& kargs) const + { + static_assert(NDimSpatial == 2, "Not supported."); + + const auto in_n_hi_wi_c_desc = make_naive_tensor_descriptor( + make_tuple( + kargs.N, kargs.input_spatial_lengths[I0], kargs.input_spatial_lengths[I1], kargs.C), + make_tuple(kargs.image_g_n_c_wis_strides[I1], + kargs.image_g_n_c_wis_strides[I3], + kargs.image_g_n_c_wis_strides[I4], + kargs.image_g_n_c_wis_strides[I2]), + number{}, + I1); + + const auto in_n_hip_wip_c_desc = transform_tensor_descriptor( + in_n_hi_wi_c_desc, + make_tuple(make_pass_through_transform(kargs.N), + make_pad_transform(kargs.input_spatial_lengths[I0], + kargs.input_left_pads[I0], + kargs.input_right_pads[I0]), + make_pad_transform(kargs.input_spatial_lengths[I1], + kargs.input_left_pads[I1], + kargs.input_right_pads[I1]), + make_pass_through_transform(kargs.C)), + make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}, sequence<3>{}), + make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}, sequence<3>{})); + + const auto in_n_y_ho_x_wo_c_desc = transform_tensor_descriptor( + in_n_hip_wip_c_desc, + make_tuple( + make_pass_through_transform(kargs.N), + make_embed_transform( + make_tuple(kargs.filter_spatial_lengths[I0], kargs.output_spatial_lengths[I0]), + make_tuple(kargs.conv_filter_dilations[I0], kargs.conv_filter_strides[I0])), + make_embed_transform( + make_tuple(kargs.filter_spatial_lengths[I1], kargs.output_spatial_lengths[I1]), + make_tuple(kargs.conv_filter_dilations[I1], kargs.conv_filter_strides[I1])), + make_pass_through_transform(kargs.C)), + make_tuple(sequence<0>{}, sequence<1>{}, sequence<2>{}, sequence<3>{}), + make_tuple(sequence<0>{}, sequence<1, 2>{}, sequence<3, 4>{}, sequence<5>{})); + + return transform_tensor_descriptor( + in_n_y_ho_x_wo_c_desc, + make_tuple( + make_merge_transform(make_tuple( + kargs.N, kargs.output_spatial_lengths[I0], kargs.output_spatial_lengths[I1])), + make_merge_transform(make_tuple( + kargs.filter_spatial_lengths[I0], kargs.filter_spatial_lengths[I1], kargs.C))), + make_tuple(sequence<0, 2, 4>{}, sequence<1, 3, 5>{}), + make_tuple(sequence<0>{}, sequence<1>{})); + } + + CK_TILE_DEVICE auto CalculateMKDims(const Kargs& kargs) const + { + static_assert(NDimSpatial == 2, "Not supported."); + const index_t M = kargs.N * static_cast(kargs.output_spatial_lengths[I0] * + kargs.output_spatial_lengths[I1]); + const index_t K = kargs.C * static_cast(kargs.filter_spatial_lengths[I0] * + kargs.filter_spatial_lengths[I1]); + return make_tuple(M, K); + } + + CK_TILE_DEVICE static constexpr auto MakeBlockTileDistribution() + { + using P = typename Problem::BlockShape; + // P: {kMWarpPerBlock * kKWarpPerBlock, kMThreadPerWarp * kKThreadPerWarp} + // Y: {kMPerThread, kKPerThread} + return make_static_tile_distribution( + tile_distribution_encoding< + sequence<1>, + tuple, + sequence>, + tuple, sequence<1, 2>>, + tuple, sequence<1, 1>>, + sequence<1, 2>, + sequence<2, 2>>{}); + } + + CK_TILE_DEVICE void ConvTensorRearrange(const Kargs& kargs) const + { + const auto [M, K] = CalculateMKDims(kargs); + + const index_t iM = __builtin_amdgcn_readfirstlane(blockIdx.x * kMPerBlock); + const index_t iK = __builtin_amdgcn_readfirstlane(blockIdx.y * kKPerBlock); + const index_t iBatch = __builtin_amdgcn_readfirstlane(blockIdx.z); + + const auto in_offset = iBatch * kargs.image_g_n_c_wis_strides[I0]; + const auto out_offset = iBatch * kargs.gemm_g_m_k_strides[I0]; + + const auto image_m_k = make_tensor_view( + static_cast(kargs.p_in) + in_offset, MakeImageMKDesc(kargs)); + const auto gemm_m_k = make_naive_tensor_view( + static_cast(kargs.p_out) + out_offset, + make_tuple(M, K), + make_tuple(kargs.gemm_g_m_k_strides[I1], kargs.gemm_g_m_k_strides[I2]), + number{}, + I1); + + const auto image_m_k_padded = + pad_tensor_view(image_m_k, + make_tuple(number{}, number{}), + sequence{}); + const auto gemm_m_k_padded = + pad_tensor_view(gemm_m_k, + make_tuple(number{}, number{}), + sequence{}); + + constexpr auto dstr = MakeBlockTileDistribution(); + + const auto image_tile = + make_tile_window(image_m_k_padded, + make_tuple(number{}, number{}), + {iM, iK}, + dstr); + + auto gemm_tile = make_tile_window(gemm_m_k_padded, + make_tuple(number{}, number{}), + {iM, iK}, + dstr); + + // load from Global + const auto loaded_tile = load_tile(image_tile); + // save to Global + store_tile(gemm_tile, loaded_tile); + } + + CK_TILE_DEVICE void operator()(Kargs& kargs) const { ConvTensorRearrange(kargs); } +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/image_to_column/pipeline/block_image_to_column_problem.hpp b/include/ck_tile/ops/image_to_column/pipeline/block_image_to_column_problem.hpp new file mode 100644 index 0000000000..8d50ffde6d --- /dev/null +++ b/include/ck_tile/ops/image_to_column/pipeline/block_image_to_column_problem.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core/utility/type_traits.hpp" + +namespace ck_tile { + +template +struct BlockImageToColumnProblem +{ + using InDataType = remove_cvref_t; + using OutDataType = remove_cvref_t; + using BlockShape = remove_cvref_t; + + static constexpr index_t NDimSpatial = NDimSpatial_; + static constexpr index_t AligmentIn = AligmentIn_; + static constexpr index_t AligmentOut = AligmentOut_; +}; + +} // namespace ck_tile diff --git a/include/ck_tile/ops/image_to_column/pipeline/tile_image_to_column_shape.hpp b/include/ck_tile/ops/image_to_column/pipeline/tile_image_to_column_shape.hpp new file mode 100644 index 0000000000..b038472fcf --- /dev/null +++ b/include/ck_tile/ops/image_to_column/pipeline/tile_image_to_column_shape.hpp @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck_tile/core.hpp" + +namespace ck_tile { +template // Sequence<... +struct TileImageToColumnShape +{ + static constexpr index_t kMPerThread = ThreadTile::at(number<0>{}); + static constexpr index_t kKPerThread = ThreadTile::at(number<1>{}); + + static constexpr index_t kMPerWarp = WarpTile::at(number<0>{}); + static constexpr index_t kKPerWarp = WarpTile::at(number<1>{}); + + static constexpr index_t kMThreadPerWarp = kMPerWarp / kMPerThread; + static constexpr index_t kKThreadPerWarp = kKPerWarp / kKPerThread; + + static constexpr index_t kMPerBlock = BlockTile::at(number<0>{}); + static constexpr index_t kKPerBlock = BlockTile::at(number<1>{}); + + static constexpr index_t kMWarpPerBlock = kMPerBlock / kMPerWarp; + static constexpr index_t kKWarpPerBlock = kKPerBlock / kKPerWarp; + + static constexpr index_t kBlockSize = warpSize * kMWarpPerBlock * kKWarpPerBlock; +}; + +} // namespace ck_tile diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 71bde7e267..e61d937f08 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -173,6 +173,7 @@ function(add_gtest_executable TEST_NAME) endfunction() add_compile_options(-Wno-c++20-extensions) +add_subdirectory(ck_tile) add_subdirectory(magic_number_division) add_subdirectory(space_filling_curve) add_subdirectory(conv_util) diff --git a/test/ck_tile/CMakeLists.txt b/test/ck_tile/CMakeLists.txt new file mode 100644 index 0000000000..9075ca2ed0 --- /dev/null +++ b/test/ck_tile/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(image_to_column) diff --git a/test/ck_tile/image_to_column/CMakeLists.txt b/test/ck_tile/image_to_column/CMakeLists.txt new file mode 100644 index 0000000000..247358dd4d --- /dev/null +++ b/test/ck_tile/image_to_column/CMakeLists.txt @@ -0,0 +1,4 @@ +# Currently ck_tile is only built on gfx9 +if(GPU_TARGETS MATCHES "gfx9") + add_gtest_executable(test_tile_image_to_column test_tile_image_to_column.cpp) +endif() diff --git a/test/ck_tile/image_to_column/test_tile_image_to_column.cpp b/test/ck_tile/image_to_column/test_tile_image_to_column.cpp new file mode 100644 index 0000000000..9c0746e972 --- /dev/null +++ b/test/ck_tile/image_to_column/test_tile_image_to_column.cpp @@ -0,0 +1,142 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include + +#include "ck_tile/host.hpp" +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/image_to_column.hpp" + +// Host API implementation +template +class TestCkTileImageToColumn : public ::testing::Test +{ + static constexpr ck_tile::index_t VectorSize = 1; + static constexpr ck_tile::index_t NDimSpatial = 2; + + protected: + void Run(const ck_tile::conv::ConvParam conv_params) + { + + using ImLayout = ck_tile::tensor_layout::convolution::NHWGC; + + const auto G = conv_params.G_; + const auto N = conv_params.N_; + const auto C = conv_params.C_; + + const ck_tile::long_index_t NDoHoWo = + N * std::accumulate(conv_params.output_spatial_lengths_.begin(), + std::next(conv_params.output_spatial_lengths_.begin(), NDimSpatial), + 1, + std::multiplies<>()); + + const ck_tile::long_index_t CZYX = + C * std::accumulate(conv_params.filter_spatial_lengths_.begin(), + std::next(conv_params.filter_spatial_lengths_.begin(), NDimSpatial), + 1, + std::multiplies<>()); + + const auto in_desc = + ck_tile::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed( + conv_params); + const auto out_desc = ck_tile::HostTensorDescriptor({G, NDoHoWo, CZYX}); + + // host verify + ck_tile::HostTensor in(in_desc); + ck_tile::HostTensor out_device(out_desc); + ck_tile::HostTensor out_host(out_desc); + + std::cout << "input: " << in.mDesc << std::endl; + std::cout << "output: " << out_device.mDesc << std::endl; + + ck_tile::FillUniformDistributionIntegerValue{-5.f, 5.f}(in); + + ck_tile::DeviceMem in_device_buf(in.get_element_space_size_in_bytes()); + ck_tile::DeviceMem out_device_buf(out_device.get_element_space_size_in_bytes()); + + in_device_buf.ToDevice(in.data()); + + using thread_tile = ck_tile::sequence<4, 4>; + using warp_tile = ck_tile::sequence<8, 128>; + using block_tile = ck_tile::sequence<32, 128>; + + using Shape = ck_tile::TileImageToColumnShape; + + using PipelineProblem = ck_tile::BlockImageToColumnProblem; + + using Kernel = ck_tile::ImageToColumn; + + auto kargs = Kernel::MakeKargs( + in_device_buf.GetDeviceBuffer(), + out_device_buf.GetDeviceBuffer(), + G, + N, + C, + ck_tile::to_array( + conv_params.input_spatial_lengths_), + ck_tile::to_array( + conv_params.filter_spatial_lengths_), + ck_tile::to_array( + conv_params.output_spatial_lengths_), + ck_tile::to_array(in_desc.get_strides()), + ck_tile::to_array(out_desc.get_strides()), + ck_tile::to_array(conv_params.conv_filter_strides_), + ck_tile::to_array( + conv_params.conv_filter_dilations_), + ck_tile::to_array(conv_params.input_left_pads_), + ck_tile::to_array(conv_params.input_right_pads_)); + + const dim3 grids = Kernel::GridSize( + kargs.N * kargs.output_spatial_lengths[0] * kargs.output_spatial_lengths[1], + kargs.filter_spatial_lengths[0] * kargs.filter_spatial_lengths[1] * kargs.C, + kargs.G); + constexpr dim3 blocks = Kernel::BlockSize(); + + constexpr ck_tile::index_t kBlockPerCu = 2; + + ck_tile::launch_kernel( + ck_tile::stream_config{}, + ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); + + // reference + ck_tile::reference_im2col(in, out_host, conv_params); + + out_device_buf.FromDevice(out_device.data()); + bool pass = ck_tile::check_err(out_device, out_host); + + EXPECT_TRUE(pass); + } +}; + +class TestCkTileImageToColumnFloat : public TestCkTileImageToColumn +{ +}; + +class TestCkTileImageToColumnHalf : public TestCkTileImageToColumn +{ +}; + +TEST_F(TestCkTileImageToColumnFloat, TestCorrectness) +{ + this->Run({2, 2, 4, 1, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + this->Run({2, 2, 64, 1, 64, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + this->Run({2, 1, 64, 1, 64, {1, 1}, {7, 7}, {3, 3}, {1, 1}, {0, 0}, {0, 0}}); + this->Run({2, 1, 64, 1, 64, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}); + this->Run({2, 2, 64, 1, 64, {3, 3}, {28, 28}, {2, 2}, {2, 2}, {1, 1}, {1, 1}}); +} + +TEST_F(TestCkTileImageToColumnHalf, TestCorrectness) +{ + this->Run({2, 2, 4, 1, 192, {3, 3}, {28, 28}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + this->Run({2, 2, 64, 1, 64, {3, 3}, {14, 14}, {1, 1}, {1, 1}, {1, 1}, {1, 1}}); + this->Run({2, 1, 64, 1, 64, {1, 1}, {7, 7}, {3, 3}, {1, 1}, {0, 0}, {0, 0}}); + this->Run({2, 1, 64, 1, 64, {1, 1}, {3, 3}, {1, 1}, {1, 1}, {0, 0}, {0, 0}}); + this->Run({2, 2, 64, 1, 64, {3, 3}, {28, 28}, {2, 2}, {2, 2}, {1, 1}, {1, 1}}); +}