From f49803101e566a11499e48c6c7183c2bf6380c78 Mon Sep 17 00:00:00 2001 From: Po Yen Chen Date: Thu, 10 Nov 2022 09:01:58 +0800 Subject: [PATCH] Add client example of grouped conv2d forward (data type: fp16) (#488) * Rename example folder for GroupedConvFwdMultipleD * Unify example codes * Change target names * Add fp16 example for multiple d instance * Re-format common.hpp * Add interface 'DeviceGroupedConvFwd' * Use simpler interface * Move common conv params out * Rename conv fwd client example folder * Add missing include directive * Update grouped conv instance implementations * Simplify ckProfiler (grouped conv forward) * Use GroupedConvFwd to implement client example * Use greater groupe count in example * Add custom target to group examples * Add extra tag param to instance factory function * Use tag to differentiate factory functions * Add missing tag argument for factory function * Remove inheritance relationship * Remove no-longer used include directive * Add license in front of file --- client_example/07_conv2d_fwd/CMakeLists.txt | 2 - .../07_grouped_conv2d_fwd/CMakeLists.txt | 2 + .../grouped_conv2d_fwd.cpp} | 155 ++++-- .../CMakeLists.txt | 22 + .../30_grouped_conv_fwd_multiple_d/README.md | 30 ++ .../30_grouped_conv_fwd_multiple_d/common.hpp | 354 ++++++++++++++ ...rouped_conv_fwd_bias_relu_add_xdl_bf16.cpp | 26 + ...rouped_conv_fwd_bias_relu_add_xdl_fp16.cpp | 26 + ...rouped_conv_fwd_bias_relu_add_xdl_fp32.cpp | 26 + ...rouped_conv_fwd_bias_relu_add_xdl_int4.cpp | 31 ++ ...rouped_conv_fwd_bias_relu_add_xdl_int8.cpp | 26 + .../grouped_conv_fwd_xdl_fp16.cpp | 24 + ...rouped_conv_fwd_bias_relu_add_example.inc} | 227 +++++---- .../run_grouped_conv_fwd_example.inc | 223 +++++++++ .../CMakeLists.txt | 11 - .../README.md | 34 -- ...uped_convnd_fwd_bias_relu_add_xdl_bf16.cpp | 459 ------------------ ...uped_convnd_fwd_bias_relu_add_xdl_fp16.cpp | 459 ------------------ ...uped_convnd_fwd_bias_relu_add_xdl_fp32.cpp | 459 ------------------ ...uped_convnd_fwd_bias_relu_add_xdl_int4.cpp | 459 ------------------ ...uped_convnd_fwd_bias_relu_add_xdl_int8.cpp | 459 ------------------ .../gpu/device/device_grouped_conv_fwd.hpp | 43 +- .../device_operation_instance_factory.hpp | 2 +- .../gpu/grouped_convolution_forward.hpp | 4 +- .../gpu/grouped_convolution_forward_dl.hpp | 4 +- .../include/profile_grouped_conv_fwd_impl.hpp | 44 +- 26 files changed, 1078 insertions(+), 2533 deletions(-) delete mode 100644 client_example/07_conv2d_fwd/CMakeLists.txt create mode 100644 client_example/07_grouped_conv2d_fwd/CMakeLists.txt rename client_example/{07_conv2d_fwd/conv2d_fwd.cpp => 07_grouped_conv2d_fwd/grouped_conv2d_fwd.cpp} (50%) create mode 100644 example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt create mode 100644 example/30_grouped_conv_fwd_multiple_d/README.md create mode 100644 example/30_grouped_conv_fwd_multiple_d/common.hpp create mode 100644 example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp create mode 100644 example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp create mode 100644 example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp create mode 100644 example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int4.cpp create mode 100644 example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int8.cpp create mode 100644 example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_xdl_fp16.cpp rename example/{30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_common.hpp => 30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc} (59%) create mode 100644 example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc delete mode 100644 example/30_grouped_convnd_fwd_bias_relu_add/CMakeLists.txt delete mode 100644 example/30_grouped_convnd_fwd_bias_relu_add/README.md delete mode 100644 example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp delete mode 100644 example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp delete mode 100644 example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp delete mode 100644 example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp delete mode 100644 example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp diff --git a/client_example/07_conv2d_fwd/CMakeLists.txt b/client_example/07_conv2d_fwd/CMakeLists.txt deleted file mode 100644 index 4247731193..0000000000 --- a/client_example/07_conv2d_fwd/CMakeLists.txt +++ /dev/null @@ -1,2 +0,0 @@ -add_executable(client_conv2d_fwd conv2d_fwd.cpp) -target_link_libraries(client_conv2d_fwd PRIVATE composable_kernel::device_operations) diff --git a/client_example/07_grouped_conv2d_fwd/CMakeLists.txt b/client_example/07_grouped_conv2d_fwd/CMakeLists.txt new file mode 100644 index 0000000000..ddc83168ac --- /dev/null +++ b/client_example/07_grouped_conv2d_fwd/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(client_grouped_conv2d_fwd grouped_conv2d_fwd.cpp) +target_link_libraries(client_grouped_conv2d_fwd PRIVATE composable_kernel::device_operations) diff --git a/client_example/07_conv2d_fwd/conv2d_fwd.cpp b/client_example/07_grouped_conv2d_fwd/grouped_conv2d_fwd.cpp similarity index 50% rename from client_example/07_conv2d_fwd/conv2d_fwd.cpp rename to client_example/07_grouped_conv2d_fwd/grouped_conv2d_fwd.cpp index 55aeac2de5..ece6e30c56 100644 --- a/client_example/07_conv2d_fwd/conv2d_fwd.cpp +++ b/client_example/07_grouped_conv2d_fwd/grouped_conv2d_fwd.cpp @@ -1,35 +1,38 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. +#include #include #include +#include +#include #include #include "ck/ck.hpp" -#include "ck/library/tensor_operation_instance/gpu/convolution_forward.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" using InDataType = ck::half_t; using WeiDataType = ck::half_t; using OutDataType = ck::half_t; -using InLayout = ck::tensor_layout::convolution::NHWC; -using WeiLayout = ck::tensor_layout::convolution::KYXC; -using OutLayout = ck::tensor_layout::convolution::NHWK; +using InLayout = ck::tensor_layout::convolution::GNHWC; +using WeiLayout = ck::tensor_layout::convolution::GKYXC; +using OutLayout = ck::tensor_layout::convolution::GNHWK; using PassThrough = ck::tensor_operation::element_wise::PassThrough; static constexpr ck::index_t NumDimSpatial = 2; -static constexpr ck::index_t N = 16; -static constexpr ck::index_t K = 32; -static constexpr ck::index_t C = 3; +static constexpr ck::index_t G = 32; +static constexpr ck::index_t N = 256; +static constexpr ck::index_t K = 192; +static constexpr ck::index_t C = 192; static constexpr ck::index_t Y = 3; static constexpr ck::index_t X = 3; -static constexpr ck::index_t Hi = 224; -static constexpr ck::index_t Wi = 224; -static constexpr ck::index_t Ho = 113; -static constexpr ck::index_t Wo = 113; +static constexpr ck::index_t Hi = 28; +static constexpr ck::index_t Wi = 28; +static constexpr ck::index_t Ho = 28; +static constexpr ck::index_t Wo = 28; struct SimpleDeviceMem { @@ -47,30 +50,66 @@ struct SimpleDeviceMem void* p_mem_; }; -int main(int argc, char* argv[]) +int main() { - std::vector in_spatial_lengths{Hi, Wi}; - std::vector filter_spatial_lengths{Y, X}; - std::vector out_spatial_lengths{Ho, Wo}; - std::vector filter_strides{2, 2}; - std::vector filter_dilations{1, 1}; - std::vector input_left_pads{2, 2}; - std::vector input_right_pads{2, 2}; + std::array in_lengths{G, N, Hi, Wi, C}; + std::array in_strides{0, 0, 0, 0, 1}; - SimpleDeviceMem in(sizeof(InDataType) * N * Hi * Wi * C); - SimpleDeviceMem wei(sizeof(WeiDataType) * K * Y * X * C); - SimpleDeviceMem out(sizeof(OutDataType) * N * Ho * Wo * K); + std::array wei_lengths{G, K, Y, X, C}; + std::array wei_strides{0, 0, 0, 0, 1}; + + std::array out_lengths{G, N, Ho, Wo, K}; + std::array out_strides{0, 0, 0, 0, 1}; + + std::partial_sum(rbegin(in_lengths), + std::prev(rend(in_lengths)), + std::next(rbegin(in_strides)), + std::multiplies<>{}); + std::partial_sum(rbegin(wei_lengths), + std::prev(rend(wei_lengths)), + std::next(rbegin(wei_strides)), + std::multiplies<>{}); + std::partial_sum(rbegin(out_lengths), + std::prev(rend(out_lengths)), + std::next(rbegin(out_strides)), + std::multiplies<>{}); + + // transpose GNHWC/GKYXC/GNHWK to GNCHW/GKCYX/GNCHW + std::rotate( + rbegin(in_lengths), std::next(rbegin(in_lengths)), std::next(rbegin(in_lengths), 3)); + std::rotate( + rbegin(in_strides), std::next(rbegin(in_strides)), std::next(rbegin(in_strides), 3)); + std::rotate( + rbegin(wei_lengths), std::next(rbegin(wei_lengths)), std::next(rbegin(wei_lengths), 3)); + std::rotate( + rbegin(wei_strides), std::next(rbegin(wei_strides)), std::next(rbegin(wei_strides), 3)); + std::rotate( + rbegin(out_lengths), std::next(rbegin(out_lengths)), std::next(rbegin(out_lengths), 3)); + std::rotate( + rbegin(out_strides), std::next(rbegin(out_strides)), std::next(rbegin(out_strides), 3)); + + std::array filter_strides{1, 1}; + std::array filter_dilations{1, 1}; + std::array input_left_pads{1, 1}; + std::array input_right_pads{1, 1}; + + SimpleDeviceMem in(sizeof(InDataType) * G * N * Hi * Wi * C); + SimpleDeviceMem wei(sizeof(WeiDataType) * G * K * Y * X * C); + SimpleDeviceMem out(sizeof(OutDataType) * G * N * Ho * Wo * K); + + using DeviceOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD, + OutLayout, + InDataType, + WeiDataType, + ck::Tuple<>, + OutDataType, + PassThrough, + PassThrough, + PassThrough>; - using DeviceOp = ck::tensor_operation::device::DeviceConvFwd; // get device op instances const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< DeviceOp>::GetInstances(); @@ -91,13 +130,16 @@ int main(int argc, char* argv[]) auto& op_ptr = op_ptrs[i]; auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), wei.GetDeviceBuffer(), + {}, out.GetDeviceBuffer(), - N, - K, - C, - in_spatial_lengths, - filter_spatial_lengths, - out_spatial_lengths, + in_lengths, + in_strides, + wei_lengths, + wei_strides, + {}, + {}, + out_lengths, + out_strides, filter_strides, filter_dilations, input_left_pads, @@ -112,10 +154,10 @@ int main(int argc, char* argv[]) { float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); - std::size_t flop = 2 * N * K * C * Ho * Wo * Y * X; - std::size_t num_bytes = sizeof(InDataType) * N * Hi * Wi * C + - sizeof(WeiDataType) * K * Y * X * C + - sizeof(OutDataType) * N * Ho * Wo * K; + std::size_t flop = std::size_t(2) * G * N * K * C * Ho * Wo * Y * X; + std::size_t num_bytes = sizeof(InDataType) * G * N * Hi * Wi * C + + sizeof(WeiDataType) * G * K * Y * X * C + + sizeof(OutDataType) * G * N * Ho * Wo * K; float tflops = static_cast(flop) / 1.E9 / avg_time; float gb_per_sec = num_bytes / 1.E6 / avg_time; @@ -134,10 +176,16 @@ int main(int argc, char* argv[]) } else { - std::cout << op_name << " does not support this problem" << std::endl; + std::cerr << op_name << " does not support this problem" << std::endl; } } + if(best_op_id < 0) + { + std::cerr << "no suitable instance" << std::endl; + return EXIT_FAILURE; + } + std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; @@ -148,13 +196,16 @@ int main(int argc, char* argv[]) << std::endl; auto argument_ptr = op_ptr->MakeArgumentPointer(in.GetDeviceBuffer(), wei.GetDeviceBuffer(), + {}, out.GetDeviceBuffer(), - N, - K, - C, - in_spatial_lengths, - filter_spatial_lengths, - out_spatial_lengths, + in_lengths, + in_strides, + wei_lengths, + wei_strides, + {}, + {}, + out_lengths, + out_strides, filter_strides, filter_dilations, input_left_pads, @@ -172,6 +223,4 @@ int main(int argc, char* argv[]) std::cout << "Done" << std::endl; } - - return 0; -} \ No newline at end of file +} diff --git a/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt b/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt new file mode 100644 index 0000000000..61b2b2f6f3 --- /dev/null +++ b/example/30_grouped_conv_fwd_multiple_d/CMakeLists.txt @@ -0,0 +1,22 @@ +add_custom_target(example_grouped_conv_fwd_multiple_d) + +add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp16 grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp) +add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_fp32 grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp) +add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_bf16 grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp) +add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int8 grouped_conv_fwd_bias_relu_add_xdl_int8.cpp) + +add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp16) +add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_fp32) +add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_bf16) +add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int8) + +if(USE_BITINT_EXTENSION_INT4) + add_example_executable(example_grouped_conv_fwd_bias_relu_add_xdl_int4 grouped_conv_fwd_bias_relu_add_xdl_int4.cpp) + + add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_bias_relu_add_xdl_int4) +endif() # USE_BITINT_EXTENSION_INT4 + + +add_example_executable(example_grouped_conv_fwd_xdl_fp16 grouped_conv_fwd_xdl_fp16.cpp) + +add_dependencies(example_grouped_conv_fwd_multiple_d example_grouped_conv_fwd_xdl_fp16) diff --git a/example/30_grouped_conv_fwd_multiple_d/README.md b/example/30_grouped_conv_fwd_multiple_d/README.md new file mode 100644 index 0000000000..739a0425a8 --- /dev/null +++ b/example/30_grouped_conv_fwd_multiple_d/README.md @@ -0,0 +1,30 @@ +Command +```bash +arg1: verification (0=no, 1=yes) +arg2: initialization (0=no init, 1=integer value, 2=decimal value) +arg3: time kernel (0=no, 1=yes) +Following arguments (depending on number of spatial dims): + Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d) + G, N, K, C, + , (ie Y, X for 2D) + , (ie Hi, Wi for 2D) + , (ie Sy, Sx for 2D) + , (ie Dy, Dx for 2D) + , (ie LeftPy, LeftPx for 2D) + , (ie RightPy, RightPx for 2D) + +./bin/example_grouped_conv_fwd_bias_relu_add_xdl_fp16 1 1 1 +``` + +Result (MI100) +``` +in: dim 5, lengths {1, 128, 192, 71, 71}, strides {192, 967872, 1, 13632, 192} +wei: dim 5, lengths {1, 256, 192, 3, 3}, strides {442368, 1728, 1, 576, 192} +bias: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0} +residual: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0} +out: dim 5, lengths {1, 128, 256, 36, 36}, strides {256, 331776, 1, 9216, 256} +launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1} +Warm up 1 time +Start running 10 times... +Perf: 1.55981 ms, 94.0927 TFlops, 213.868 GB/s, DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<256, 128, 256, 16, Default> +``` diff --git a/example/30_grouped_conv_fwd_multiple_d/common.hpp b/example/30_grouped_conv_fwd_multiple_d/common.hpp new file mode 100644 index 0000000000..3995403607 --- /dev/null +++ b/example/30_grouped_conv_fwd_multiple_d/common.hpp @@ -0,0 +1,354 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/convolution_forward_specialization.hpp" +#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" + +using BF16 = ck::bhalf_t; +using FP16 = ck::half_t; +using FP32 = float; +#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +using I4 = ck::int4_t; +#endif +using I8 = std::int8_t; +using I32 = std::int32_t; + +template +using S = ck::Sequence; + +using PassThrough = ck::tensor_operation::element_wise::PassThrough; + +static constexpr auto ConvSpec = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +template +struct CommonLayoutSetting +{ + using InputLayout = InputLay; + using WeightLayout = WeightLay; + using OutputLayout = OutputLay; +}; + +template +struct CommonLayoutSettingSelector; + +namespace ctl = ck::tensor_layout::convolution; + +template <> +struct CommonLayoutSettingSelector<1> final + : CommonLayoutSetting +{ +}; + +template <> +struct CommonLayoutSettingSelector<2> final + : CommonLayoutSetting +{ +}; + +template <> +struct CommonLayoutSettingSelector<3> final + : CommonLayoutSetting +{ +}; + +template +using InputLayout = typename CommonLayoutSettingSelector::InputLayout; + +template +using WeightLayout = typename CommonLayoutSettingSelector::WeightLayout; + +template +using OutputLayout = typename CommonLayoutSettingSelector::OutputLayout; + +struct ExecutionConfig final +{ + bool do_verification = true; + int init_method = 1; + bool time_kernel = true; +}; + +#define DefaultConvParam \ + ck::utils::conv::ConvParam \ + { \ + 2, 32, 2, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, { 1, 1 } \ + } + +inline void print_help_msg() +{ + std::cerr << "arg1: verification (0=no, 1=yes)\n" + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << "arg3: time kernel (0=no, 1=yes)\n" + << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl; +} + +inline bool parse_cmd_args(int argc, + char* argv[], + ExecutionConfig& config, + ck::utils::conv::ConvParam& conv_param) +{ + constexpr int num_execution_config_args = + 3; // arguments for do_verification, init_method, time_kernel + constexpr int num_conv_param_leading_args = 5; // arguments for num_dim_spatial_, G_, N_, K_, C_ + + constexpr int threshold_to_catch_partial_args = 1 + num_execution_config_args; + constexpr int threshold_to_catch_all_args = + threshold_to_catch_partial_args + num_conv_param_leading_args; + + if(argc == 1) + { + // use default + } + // catch only ExecutionConfig arguments + else if(argc == threshold_to_catch_partial_args) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + } + // catch both ExecutionConfig & ConvParam arguments + else if(threshold_to_catch_all_args < argc && ((argc - threshold_to_catch_all_args) % 3 == 0)) + { + config.do_verification = std::stoi(argv[1]); + config.init_method = std::stoi(argv[2]); + config.time_kernel = std::stoi(argv[3]); + + const ck::index_t num_dim_spatial = std::stoi(argv[4]); + conv_param = ck::utils::conv::parse_conv_param( + num_dim_spatial, threshold_to_catch_partial_args, argv); + } + else + { + print_help_msg(); + return false; + } + + return true; +} + +inline HostTensorDescriptor make_input_descriptor(const ck::utils::conv::ConvParam& conv_param) +{ + switch(conv_param.num_dim_spatial_) + { + case 1: + return HostTensorDescriptor( + {conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]}, + { + conv_param.C_, // g + conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n + 1, // c + conv_param.G_ * conv_param.C_ // wi + }); + + case 2: + return HostTensorDescriptor( + {conv_param.G_, + conv_param.N_, + conv_param.C_, + conv_param.input_spatial_lengths_[0], + conv_param.input_spatial_lengths_[1]}, + { + conv_param.C_, // g + conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] * + conv_param.G_ * conv_param.C_, // n + 1, // c + conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi + conv_param.G_ * conv_param.C_ // wi + }); + + case 3: + return HostTensorDescriptor( + {conv_param.G_, + conv_param.N_, + conv_param.C_, + conv_param.input_spatial_lengths_[0], + conv_param.input_spatial_lengths_[1], + conv_param.input_spatial_lengths_[2]}, + { + conv_param.C_, // g + conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] * + conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n + 1, // c + conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] * + conv_param.G_ * conv_param.C_, // di + conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi + conv_param.G_ * conv_param.C_ // wi + }); + } + + throw std::runtime_error("unsuppored # dim spatial"); +} + +inline HostTensorDescriptor make_weight_descriptor(const ck::utils::conv::ConvParam& conv_param) +{ + switch(conv_param.num_dim_spatial_) + { + case 1: + return HostTensorDescriptor( + {conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]}, + { + conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g + conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k + 1, // c + conv_param.C_ // x + }); + case 2: + return HostTensorDescriptor( + {conv_param.G_, + conv_param.K_, + conv_param.C_, + conv_param.filter_spatial_lengths_[0], + conv_param.filter_spatial_lengths_[1]}, + { + conv_param.K_ * conv_param.filter_spatial_lengths_[0] * + conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g + conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] * + conv_param.C_, // k + 1, // c + conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y + conv_param.C_ // x + }); + case 3: + return HostTensorDescriptor( + {conv_param.G_, + conv_param.K_, + conv_param.C_, + conv_param.filter_spatial_lengths_[0], + conv_param.filter_spatial_lengths_[1], + conv_param.filter_spatial_lengths_[2]}, + { + conv_param.K_ * conv_param.filter_spatial_lengths_[0] * + conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] * + conv_param.C_, // g + conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] * + conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k + 1, // c + conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] * + conv_param.C_, // z + conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y + conv_param.C_ // x + }); + } + + throw std::runtime_error("unsuppored # dim spatial"); +} + +inline HostTensorDescriptor make_bias_descriptor(const ck::utils::conv::ConvParam& conv_param) +{ + switch(conv_param.num_dim_spatial_) + { + case 1: + return HostTensorDescriptor( + {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]}, + { + conv_param.K_, // g + 0, // k + 1, // c + 0 // x + }); + case 2: + return HostTensorDescriptor({conv_param.G_, + conv_param.N_, + conv_param.K_, + conv_param.output_spatial_lengths_[0], + conv_param.output_spatial_lengths_[1]}, + { + conv_param.K_, // g + 0, // n + 1, // k + 0, // ho + 0 // wo + }); + case 3: + return HostTensorDescriptor({conv_param.G_, + conv_param.N_, + conv_param.K_, + conv_param.output_spatial_lengths_[0], + conv_param.output_spatial_lengths_[1], + conv_param.output_spatial_lengths_[2]}, + { + conv_param.K_, // g + 0, // n + 1, // k + 0, // z + 0, // y + 0 // x + }); + } + + throw std::runtime_error("unsuppored # dim spatial"); +} + +inline HostTensorDescriptor make_output_descriptor(const ck::utils::conv::ConvParam& conv_param) +{ + + switch(conv_param.num_dim_spatial_) + { + case 1: + return HostTensorDescriptor( + {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]}, + { + conv_param.K_, // g + conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n + 1, // k + conv_param.G_ * conv_param.K_ // wo + }); + case 2: + return HostTensorDescriptor( + {conv_param.G_, + conv_param.N_, + conv_param.K_, + conv_param.output_spatial_lengths_[0], + conv_param.output_spatial_lengths_[1]}, + { + conv_param.K_, // g + conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] * + conv_param.G_ * conv_param.K_, // n + 1, // k + conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho + conv_param.G_ * conv_param.K_ // wo + }); + + case 3: + return HostTensorDescriptor( + {conv_param.G_, + conv_param.N_, + conv_param.K_, + conv_param.output_spatial_lengths_[0], + conv_param.output_spatial_lengths_[1], + conv_param.output_spatial_lengths_[2]}, + { + conv_param.K_, // g + conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] * + conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n + 1, // k + conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] * + conv_param.G_ * conv_param.K_, // do + conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho + conv_param.G_ * conv_param.K_ // wo + }); + } + + throw std::runtime_error("unsuppored # dim spatial"); +} diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp new file mode 100644 index 0000000000..ee300d073a --- /dev/null +++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_bf16.cpp @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +// kernel data types +using InKernelDataType = BF16; +using WeiKernelDataType = BF16; +using AccDataType = FP32; +using CShuffleDataType = FP32; +using BiasKernelDataType = BF16; +using ResidualKernelDataType = BF16; +using OutKernelDataType = BF16; + +// tensor data types +using InUserDataType = InKernelDataType; +using WeiUserDataType = WeiKernelDataType; +using OutUserDataType = OutKernelDataType; + +using InElementOp = PassThrough; +using WeiElementOp = PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd; + +#include "run_grouped_conv_fwd_bias_relu_add_example.inc" + +int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); } diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp new file mode 100644 index 0000000000..5a9df0b1e8 --- /dev/null +++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp16.cpp @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +// kernel data types +using InKernelDataType = FP16; +using WeiKernelDataType = FP16; +using AccDataType = FP32; +using CShuffleDataType = FP16; +using BiasKernelDataType = FP16; +using ResidualKernelDataType = FP16; +using OutKernelDataType = FP16; + +// tensor data types +using InUserDataType = InKernelDataType; +using WeiUserDataType = WeiKernelDataType; +using OutUserDataType = OutKernelDataType; + +using InElementOp = PassThrough; +using WeiElementOp = PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd; + +#include "run_grouped_conv_fwd_bias_relu_add_example.inc" + +int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); } diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp new file mode 100644 index 0000000000..c2906cc9dd --- /dev/null +++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_fp32.cpp @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +// kernel data types +using InKernelDataType = FP32; +using WeiKernelDataType = FP32; +using AccDataType = FP32; +using CShuffleDataType = FP32; +using BiasKernelDataType = FP32; +using ResidualKernelDataType = FP32; +using OutKernelDataType = FP32; + +// tensor data types +using InUserDataType = InKernelDataType; +using WeiUserDataType = WeiKernelDataType; +using OutUserDataType = OutKernelDataType; + +using InElementOp = PassThrough; +using WeiElementOp = PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd; + +#include "run_grouped_conv_fwd_bias_relu_add_example.inc" + +int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); } diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int4.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int4.cpp new file mode 100644 index 0000000000..3d5a243e6b --- /dev/null +++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int4.cpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#ifndef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +#error Should compile this file with ck::int4_t support +#endif + +#include "common.hpp" + +// kernel data types +using InKernelDataType = I8; +using WeiKernelDataType = I8; +using AccDataType = I32; +using CShuffleDataType = I8; +using BiasKernelDataType = I8; +using ResidualKernelDataType = I8; +using OutKernelDataType = I8; + +// tensor data types +using InUserDataType = I4; +using WeiUserDataType = I4; +using OutUserDataType = I4; + +using InElementOp = PassThrough; +using WeiElementOp = PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd; + +#define BUILD_INT4_EXAMPLE +#include "run_grouped_conv_fwd_bias_relu_add_example.inc" + +int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); } diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int8.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int8.cpp new file mode 100644 index 0000000000..eaf680fa43 --- /dev/null +++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_bias_relu_add_xdl_int8.cpp @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +// kernel data types +using InKernelDataType = I8; +using WeiKernelDataType = I8; +using AccDataType = I32; +using CShuffleDataType = I8; +using BiasKernelDataType = I8; +using ResidualKernelDataType = I8; +using OutKernelDataType = I8; + +// tensor data types +using InUserDataType = InKernelDataType; +using WeiUserDataType = WeiKernelDataType; +using OutUserDataType = OutKernelDataType; + +using InElementOp = PassThrough; +using WeiElementOp = PassThrough; +using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd; + +#include "run_grouped_conv_fwd_bias_relu_add_example.inc" + +int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_bias_relu_add_example(argc, argv); } diff --git a/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_xdl_fp16.cpp b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_xdl_fp16.cpp new file mode 100644 index 0000000000..6de1daa3d4 --- /dev/null +++ b/example/30_grouped_conv_fwd_multiple_d/grouped_conv_fwd_xdl_fp16.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "common.hpp" + +// kernel data types +using InKernelDataType = FP16; +using WeiKernelDataType = FP16; +using AccDataType = FP32; +using CShuffleDataType = FP16; +using OutKernelDataType = FP16; + +// tensor data types +using InUserDataType = InKernelDataType; +using WeiUserDataType = WeiKernelDataType; +using OutUserDataType = OutKernelDataType; + +using InElementOp = PassThrough; +using WeiElementOp = PassThrough; +using OutElementOp = PassThrough; + +#include "run_grouped_conv_fwd_example.inc" + +int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_example(argc, argv); } diff --git a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_common.hpp b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc similarity index 59% rename from example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_common.hpp rename to example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc index a2d9c21287..059ef3e341 100644 --- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_common.hpp +++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_bias_relu_add_example.inc @@ -1,59 +1,110 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. -#include -#include -#include -#include - -#include "ck/ck.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" - -#include "ck/library/utility/check_err.hpp" -#include "ck/library/utility/device_memory.hpp" -#include "ck/library/utility/host_tensor.hpp" -#include "ck/library/utility/host_tensor_generator.hpp" -#include "ck/library/utility/convolution_parameter.hpp" -#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" - -void print_helper_msg() +template +struct LayoutSetting { - std::cout << "arg1: verification (0=no, 1=yes)\n" - << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" - << "arg3: time kernel (0=no, 1=yes)\n" - << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl; -} + using BiasLayout = BiasLay; + using ResidualLayout = ResidualLay; +}; -template -int run_grouped_conv_fwd_bias_relu_add(bool do_verification, - int init_method, - bool time_kernel, - const ck::utils::conv::ConvParam& conv_param, - const HostTensorDescriptor& in_g_n_c_wis_desc, - const HostTensorDescriptor& wei_g_k_c_xs_desc, - const HostTensorDescriptor& bias_g_n_k_wos_desc, - const HostTensorDescriptor& residual_g_n_k_wos_desc, - const HostTensorDescriptor& out_g_n_k_wos_desc, - const InElementOp& in_element_op, - const WeiElementOp& wei_element_op, - const OutElementOp& out_element_op) +template +struct LayoutSettingSelector; + +template <> +struct LayoutSettingSelector<1> final : LayoutSetting { +}; + +template <> +struct LayoutSettingSelector<2> final : LayoutSetting +{ +}; + +template <> +struct LayoutSettingSelector<3> final : LayoutSetting +{ +}; + +template +using BiasLayout = typename LayoutSettingSelector::BiasLayout; + +template +using ResidualLayout = typename LayoutSettingSelector::ResidualLayout; + +template +using DeviceConvFwdInstance = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< + NDimSpatial, + InputLayout, + WeightLayout, + ck::Tuple, ResidualLayout>, + OutputLayout, + InKernelDataType, + WeiKernelDataType, + AccDataType, + CShuffleDataType, + ck::Tuple, + OutKernelDataType, + InElementOp, + WeiElementOp, + OutElementOp, + ConvSpec, // ConvForwardSpecialization + GemmSpec, // GemmSpecialization + 1, // + 256, // BlockSize + 128, // MPerBlock + 256, // NPerBlock + 16, // KPerBlock + 4, // AK1 + 4, // BK1 + 32, // MPerXdl + 32, // NPerXdl + 2, // MXdlPerWave + 4, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 4, // ABlockTransferSrcScalarPerVector + 4, // ABlockTransferDstScalarPerVector_AK1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 4, // BBlockTransferSrcScalarPerVector + 4, // BBlockTransferDstScalarPerVector_BK1 + 1, // BBlockLdsExtraN + 1, + 1, + S<1, 16, 1, 16>, + 4>; + +template +using HostConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd; + +template +bool run_grouped_conv_fwd_bias_relu_add(const ExecutionConfig& config, + const ck::utils::conv::ConvParam& conv_param) +{ + static_assert(1 <= NDimSpatial && NDimSpatial <= 3, "Unsupported NDimSpatial"); + + const auto in_g_n_c_wis_desc = make_input_descriptor(conv_param); + const auto wei_g_k_c_xs_desc = make_weight_descriptor(conv_param); + const auto bias_g_n_k_wos_desc = make_bias_descriptor(conv_param); + const auto out_g_n_k_wos_desc = make_output_descriptor(conv_param); + Tensor in(in_g_n_c_wis_desc); Tensor wei(wei_g_k_c_xs_desc); Tensor bias(bias_g_n_k_wos_desc); - Tensor residual(residual_g_n_k_wos_desc); + Tensor residual(bias_g_n_k_wos_desc); Tensor out_host(out_g_n_k_wos_desc); Tensor out_device(out_g_n_k_wos_desc); @@ -63,7 +114,7 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification, std::cout << "residual: " << residual.mDesc << std::endl; std::cout << "out: " << out_host.mDesc << std::endl; - switch(init_method) + switch(config.init_method) { case 0: break; case 1: @@ -83,7 +134,7 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification, DeviceMem residual_device_buf(sizeof(OutKernelDataType) * residual.mDesc.GetElementSpaceSize()); DeviceMem out_device_buf(sizeof(OutKernelDataType) * out_device.mDesc.GetElementSpaceSize()); -#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +#ifdef BUILD_INT4_EXAMPLE const Tensor in_converted(in); const Tensor wei_converted(wei); const Tensor bias_converted(bias); @@ -93,12 +144,12 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification, wei_device_buf.ToDevice(wei_converted.mData.data()); bias_device_buf.ToDevice(bias_converted.mData.data()); residual_device_buf.ToDevice(residual_converted.mData.data()); -#else // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +#else in_device_buf.ToDevice(in.mData.data()); wei_device_buf.ToDevice(wei.mData.data()); bias_device_buf.ToDevice(bias.mData.data()); residual_device_buf.ToDevice(residual.mData.data()); -#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +#endif std::array a_g_n_c_wis_lengths{}; std::array a_g_n_c_wis_strides{}; @@ -123,8 +174,8 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification, copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides); copy(bias_g_n_k_wos_desc.GetLengths(), d0_g_n_k_wos_lengths); copy(bias_g_n_k_wos_desc.GetStrides(), d0_g_n_k_wos_strides); - copy(residual_g_n_k_wos_desc.GetLengths(), d1_g_n_k_wos_lengths); - copy(residual_g_n_k_wos_desc.GetStrides(), d1_g_n_k_wos_strides); + copy(bias_g_n_k_wos_desc.GetLengths(), d1_g_n_k_wos_lengths); + copy(bias_g_n_k_wos_desc.GetStrides(), d1_g_n_k_wos_strides); copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths); copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides); copy(conv_param.conv_filter_strides_, conv_filter_strides); @@ -133,7 +184,7 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification, copy(conv_param.input_right_pads_, input_right_pads); // do Conv - auto conv = DeviceConvNDFwdInstance{}; + auto conv = DeviceConvFwdInstance{}; auto invoker = conv.MakeInvoker(); auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(), @@ -155,9 +206,9 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification, conv_filter_dilations, input_left_pads, input_right_pads, - in_element_op, - wei_element_op, - out_element_op); + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); if(!conv.IsSupportedArgument(argument)) { @@ -166,7 +217,7 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification, "not support this Conv problem"); } - float avg_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel}); + float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); std::size_t flop = conv_param.GetFlops(); std::size_t num_btype = conv_param.GetByte(); @@ -176,20 +227,11 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification, std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " << conv.GetTypeString() << std::endl; - if(do_verification) + if(config.do_verification) { - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - Tensor c_host(out_g_n_k_wos_desc); - auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd(); - + auto ref_conv = HostConvFwdInstance{}; auto ref_invoker = ref_conv.MakeInvoker(); auto ref_argument = ref_conv.MakeArgument(in, wei, @@ -198,36 +240,49 @@ int run_grouped_conv_fwd_bias_relu_add(bool do_verification, conv_param.conv_filter_dilations_, conv_param.input_left_pads_, conv_param.input_right_pads_, - in_element_op, - wei_element_op, + InElementOp{}, + WeiElementOp{}, PassThrough{}); ref_invoker.Run(ref_argument); // TODO: implement elementwise operation for host out_host.ForEach([&](auto&, auto idx) { - out_element_op(out_host(idx), c_host(idx), bias(idx), residual(idx)); + OutElementOp{}(out_host(idx), c_host(idx), bias(idx), residual(idx)); }); out_device_buf.FromDevice(out_device.mData.data()); -#ifdef CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 +#ifdef BUILD_INT4_EXAMPLE const Tensor out_device_converted(out_device); - return ck::utils::check_err(out_device_converted.mData, - out_host.mData, - "Error: incorrect results!", - 1e-5f, - 1e-4f) - ? 0 - : 1; -#else // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 return ck::utils::check_err( - out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f) - ? 0 - : 1; -#endif // CK_EXPERIMENTAL_BIT_INT_EXTENSION_INT4 + out_device_converted.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f); +#else + return ck::utils::check_err( + out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f); +#endif } - return 0; + return true; +} + +bool run_grouped_conv_fwd_bias_relu_add_example(int argc, char* argv[]) +{ + ExecutionConfig config; + ck::utils::conv::ConvParam conv_param = DefaultConvParam; + + if(!parse_cmd_args(argc, argv, config, conv_param)) + { + return false; + } + + switch(conv_param.num_dim_spatial_) + { + case 1: return run_grouped_conv_fwd_bias_relu_add<1>(config, conv_param); + case 2: return run_grouped_conv_fwd_bias_relu_add<2>(config, conv_param); + case 3: return run_grouped_conv_fwd_bias_relu_add<3>(config, conv_param); + } + + return false; } diff --git a/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc new file mode 100644 index 0000000000..27ddcb6bec --- /dev/null +++ b/example/30_grouped_conv_fwd_multiple_d/run_grouped_conv_fwd_example.inc @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +template +using DeviceConvFwdInstance = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< + NDimSpatial, + InputLayout, + WeightLayout, + ck::Tuple<>, + OutputLayout, + InKernelDataType, + WeiKernelDataType, + AccDataType, + CShuffleDataType, + ck::Tuple<>, + OutKernelDataType, + InElementOp, + WeiElementOp, + OutElementOp, + ConvSpec, // ConvForwardSpecialization + GemmSpec, // GemmSpecialization + 1, // + 256, // BlockSize + 128, // MPerBlock + 256, // NPerBlock + 16, // KPerBlock + 4, // AK1 + 4, // BK1 + 32, // MPerXdl + 32, // NPerXdl + 2, // MXdlPerWave + 4, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 4, // ABlockTransferSrcScalarPerVector + 4, // ABlockTransferDstScalarPerVector_AK1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 4, // BBlockTransferSrcScalarPerVector + 4, // BBlockTransferDstScalarPerVector_BK1 + 1, // BBlockLdsExtraN + 1, + 1, + S<1, 16, 1, 16>, + 4>; + +template +using HostConvFwdInstance = ck::tensor_operation::host::ReferenceConvFwd; + +template +bool run_grouped_conv_fwd(const ExecutionConfig& config, + const ck::utils::conv::ConvParam& conv_param) +{ + static_assert(1 <= NDimSpatial && NDimSpatial <= 3, "Unsupported NDimSpatial"); + + const auto in_g_n_c_wis_desc = make_input_descriptor(conv_param); + const auto wei_g_k_c_xs_desc = make_weight_descriptor(conv_param); + const auto out_g_n_k_wos_desc = make_output_descriptor(conv_param); + + Tensor in(in_g_n_c_wis_desc); + Tensor wei(wei_g_k_c_xs_desc); + Tensor out_host(out_g_n_k_wos_desc); + Tensor out_device(out_g_n_k_wos_desc); + + std::cout << "in: " << in.mDesc << std::endl; + std::cout << "wei: " << wei.mDesc << std::endl; + std::cout << "out: " << out_host.mDesc << std::endl; + + switch(config.init_method) + { + case 0: break; + case 1: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + default: + in.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); + wei.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem in_device_buf(sizeof(InKernelDataType) * in.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiKernelDataType) * wei.mDesc.GetElementSpaceSize()); + DeviceMem out_device_buf(sizeof(OutKernelDataType) * out_device.mDesc.GetElementSpaceSize()); + +#ifdef BUILD_INT4_EXAMPLE + const Tensor in_converted(in); + const Tensor wei_converted(wei); + + in_device_buf.ToDevice(in_converted.mData.data()); + wei_device_buf.ToDevice(wei_converted.mData.data()); +#else + in_device_buf.ToDevice(in.mData.data()); + wei_device_buf.ToDevice(wei.mData.data()); +#endif + + std::array a_g_n_c_wis_lengths{}; + std::array a_g_n_c_wis_strides{}; + std::array b_g_k_c_xs_lengths{}; + std::array b_g_k_c_xs_strides{}; + std::array e_g_n_k_wos_lengths{}; + std::array e_g_n_k_wos_strides{}; + std::array conv_filter_strides{}; + std::array conv_filter_dilations{}; + std::array input_left_pads{}; + std::array input_right_pads{}; + + auto copy = [](auto& x, auto& y) { std::copy(x.begin(), x.end(), y.begin()); }; + + copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths); + copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides); + copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths); + copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides); + copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths); + copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides); + copy(conv_param.conv_filter_strides_, conv_filter_strides); + copy(conv_param.conv_filter_dilations_, conv_filter_dilations); + copy(conv_param.input_left_pads_, input_left_pads); + copy(conv_param.input_right_pads_, input_right_pads); + + // do Conv + auto conv = DeviceConvFwdInstance{}; + auto invoker = conv.MakeInvoker(); + auto argument = conv.MakeArgument(in_device_buf.GetDeviceBuffer(), + wei_device_buf.GetDeviceBuffer(), + std::array{}, + out_device_buf.GetDeviceBuffer(), + a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + std::array, 0>{}, + std::array, 0>{}, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); + + if(!conv.IsSupportedArgument(argument)) + { + throw std::runtime_error( + "wrong! device_conv with the specified compilation parameters does " + "not support this Conv problem"); + } + + float avg_time = invoker.Run(argument, StreamConfig{nullptr, config.time_kernel}); + + std::size_t flop = conv_param.GetFlops(); + std::size_t num_btype = conv_param.GetByte(); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_btype / 1.E6 / avg_time; + std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, " + << conv.GetTypeString() << std::endl; + + if(config.do_verification) + { + auto ref_conv = HostConvFwdInstance{}; + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(in, + wei, + out_host, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + InElementOp{}, + WeiElementOp{}, + OutElementOp{}); + + ref_invoker.Run(ref_argument); + + out_device_buf.FromDevice(out_device.mData.data()); + +#ifdef BUILD_INT4_EXAMPLE + const Tensor out_device_converted(out_device); + + return ck::utils::check_err( + out_device_converted.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f); +#else + return ck::utils::check_err( + out_device.mData, out_host.mData, "Error: incorrect results!", 1e-5f, 1e-4f); +#endif + } + + return true; +} + +bool run_grouped_conv_fwd_example(int argc, char* argv[]) +{ + ExecutionConfig config; + ck::utils::conv::ConvParam conv_param = DefaultConvParam; + + if(!parse_cmd_args(argc, argv, config, conv_param)) + { + return false; + } + + switch(conv_param.num_dim_spatial_) + { + case 1: return run_grouped_conv_fwd<1>(config, conv_param); + case 2: return run_grouped_conv_fwd<2>(config, conv_param); + case 3: return run_grouped_conv_fwd<3>(config, conv_param); + } + + return false; +} diff --git a/example/30_grouped_convnd_fwd_bias_relu_add/CMakeLists.txt b/example/30_grouped_convnd_fwd_bias_relu_add/CMakeLists.txt deleted file mode 100644 index 98c2211b19..0000000000 --- a/example/30_grouped_convnd_fwd_bias_relu_add/CMakeLists.txt +++ /dev/null @@ -1,11 +0,0 @@ -add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_fp16 grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp) - -add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_fp32 grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp) - -add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_bf16 grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp) - -add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_int8 grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp) - -if(USE_BITINT_EXTENSION_INT4) - add_example_executable(example_grouped_convnd_fwd_bias_relu_add_xdl_int4 grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp) -endif() # USE_BITINT_EXTENSION_INT4 diff --git a/example/30_grouped_convnd_fwd_bias_relu_add/README.md b/example/30_grouped_convnd_fwd_bias_relu_add/README.md deleted file mode 100644 index eea3364b3f..0000000000 --- a/example/30_grouped_convnd_fwd_bias_relu_add/README.md +++ /dev/null @@ -1,34 +0,0 @@ -```bash -#arg1: verification (0=no, 1=yes) -#arg2: initialization (0=no init, 1=integer value, 2=decimal value) -#arg3: time kernel (0=no, 1=yes) -#Following arguments (depending on number of spatial dims): -# N spatial dimensions -# G, N, K, C, -# , (ie Y, X for 2D) -# , (ie Hi, Wi for 2D) -# , (ie Sy, Sx for 2D) -# , (ie Dy, Dx for 2D) -# , (ie LeftPy, LeftPx for 2D) -# , (ie RightPy, RightPx for 2D) - -bin/example_grouped_convnd_fwd_bias_relu_add_xdl_fp16 1 1 1 -``` - -Result (MI100) -``` -in: dim 5, lengths {2, 128, 192, 71, 71}, strides {192, 1935744, 1, 27264, 384} -wei: dim 5, lengths {2, 256, 192, 3, 3}, strides {442368, 1728, 1, 576, 192} -bias: dim 5, lengths {2, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0} -residual: dim 5, lengths {2, 128, 256, 36, 36}, strides {256, 0, 1, 0, 0} -out: dim 5, lengths {2, 128, 256, 36, 36}, strides {256, 663552, 1, 18432, 512} -A[M, K]: {165888, 1728} -B[N, K]: {256, 1728} -Ds[M, N]: {165888, 256} -Ds[M, N]: {165888, 256} -E[M, N]: {165888, 256} -launch_and_time_kernel: grid_dim {2592, 1, 1}, block_dim {256, 1, 1} -Warm up 1 time -Start running 10 times... -Perf: 2.48075 ms, 118.325 TFlops, 268.946 GB/s, DeviceGroupedConvFwdMultipleD_Xdl_CShuffle<256, 128, 256, 32, Default> -``` \ No newline at end of file diff --git a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp deleted file mode 100644 index 984f28c845..0000000000 --- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_bf16.cpp +++ /dev/null @@ -1,459 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "grouped_convnd_fwd_bias_relu_add_common.hpp" - -#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" - -#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" - -// kernel data types -using InKernelDataType = ck::bhalf_t; -using WeiKernelDataType = ck::bhalf_t; -using AccDataType = float; -using CShuffleDataType = float; -using BiasKernelDataType = ck::bhalf_t; -using ResidualKernelDataType = ck::bhalf_t; -using OutKernelDataType = ck::bhalf_t; - -// tensor data types -using InUserDataType = InKernelDataType; -using WeiUserDataType = WeiKernelDataType; -using OutUserDataType = OutKernelDataType; - -template -using S = ck::Sequence; - -using InElementOp = ck::tensor_operation::element_wise::PassThrough; -using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; -using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd; - -static constexpr auto ConvSpec = - ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; - -static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - -template -using DeviceGroupedConvNDFwdInstance = - ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< - NDimSpatial, - InLayout, - WeiLayout, - ck::Tuple, - OutLayout, - InKernelDataType, - WeiKernelDataType, - AccDataType, - CShuffleDataType, - ck::Tuple, - OutKernelDataType, - InElementOp, - WeiElementOp, - OutElementOp, - ConvSpec, // ConvForwardSpecialization - GemmSpec, // GemmSpecialization - 1, // - 256, // BlockSize - 128, // MPerBlock - 256, // NPerBlock - 32, // KPerBlock - 8, // AK1 - 8, // BK1 - 32, // MPerXdl - 32, // NPerXdl - 2, // MXdlPerWave - 4, // NXdlPerWave - S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 - S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder - S<1, 0, 2>, // ABlockTransferSrcAccessOrder - 2, // ABlockTransferSrcVectorDim - 8, // ABlockTransferSrcScalarPerVector - 8, // ABlockTransferDstScalarPerVector_AK1 - 1, // ABlockLdsExtraM - S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 - S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder - S<1, 0, 2>, // BBlockTransferSrcAccessOrder - 2, // BBlockTransferSrcVectorDim - 8, // BBlockTransferSrcScalarPerVector - 8, // BBlockTransferDstScalarPerVector_BK1 - 1, // BBlockLdsExtraN - 1, - 1, - S<1, 32, 1, 8>, - 8>; - -int main(int argc, char* argv[]) -{ - namespace ctc = ck::tensor_layout::convolution; - - print_helper_msg(); - - bool do_verification = true; - int init_method = 1; - bool time_kernel = false; - - // conventional group conv definition - // G = 2 - // [N, C, Hi, Wi] = [128, 384, 71, 71] - // [K, C, Y, X] = [512, 192, 3, 3] - // [N, K, Ho, Wo] = [128, 512, 36, 36] - // CK group conv definition - // [G, N, C, Hi, Wi] = [2, 128, 192, 71, 71] - // [G, K, C, Y, X] = [2, 256, 192, 3, 3] - // [G, N, K, Ho, Wo] = [2, 128, 256, 36, 36] - ck::utils::conv::ConvParam conv_param{ - 2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}}; - - if(argc == 1) - { - // use default - } - else if(argc == 4) - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - time_kernel = std::stoi(argv[3]); - } - else - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - time_kernel = std::stoi(argv[3]); - const ck::index_t num_dim_spatial = std::stoi(argv[4]); - - conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv); - } - - const auto in_element_op = InElementOp{}; - const auto wei_element_op = WeiElementOp{}; - const auto out_element_op = OutElementOp{}; - - if(conv_param.num_dim_spatial_ == 1) - { - using InLayout = ctc::G_NW_C; - using WeiLayout = ctc::G_K_X_C; - using BiasLayout = ctc::G_K; - using ResidualLayout = ctc::G_NW_K; - using OutLayout = ctc::G_NW_K; - - const auto in_g_n_c_wis_desc = HostTensorDescriptor( - {conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]}, - { - conv_param.C_, // g - conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n - 1, // c - conv_param.G_ * conv_param.C_ // wi - }); - - const auto wei_g_k_c_xs_desc = HostTensorDescriptor( - {conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]}, - { - conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g - conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k - 1, // c - conv_param.C_ // x - }); - - const auto bias_g_n_k_wos_desc = HostTensorDescriptor( - {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]}, - { - conv_param.K_, // g - 0, // k - 1, // c - 0 // x - }); - - const auto residual_g_n_k_wos_desc = HostTensorDescriptor( - {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]}, - { - conv_param.K_, // g - 0, // k - 1, // c - 0 // x - }); - - const auto out_g_n_k_wos_desc = HostTensorDescriptor( - {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]}, - { - conv_param.K_, // g - conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n - 1, // k - conv_param.G_ * conv_param.K_ // wo - }); - - return run_grouped_conv_fwd_bias_relu_add<1, - InKernelDataType, - WeiKernelDataType, - CShuffleDataType, - OutKernelDataType, - InElementOp, - WeiElementOp, - OutElementOp, - InUserDataType, - WeiUserDataType, - OutUserDataType, - DeviceGroupedConvNDFwdInstance<1, - InLayout, - WeiLayout, - BiasLayout, - ResidualLayout, - OutLayout>>( - do_verification, - init_method, - time_kernel, - conv_param, - in_g_n_c_wis_desc, - wei_g_k_c_xs_desc, - bias_g_n_k_wos_desc, - residual_g_n_k_wos_desc, - out_g_n_k_wos_desc, - in_element_op, - wei_element_op, - out_element_op); - } - else if(conv_param.num_dim_spatial_ == 2) - { - using InLayout = ctc::G_NHW_C; - using WeiLayout = ctc::G_K_YX_C; - using BiasLayout = ctc::G_K; - using ResidualLayout = ctc::G_NHW_K; - using OutLayout = ctc::G_NHW_K; - - const auto in_g_n_c_wis_desc = HostTensorDescriptor( - {conv_param.G_, - conv_param.N_, - conv_param.C_, - conv_param.input_spatial_lengths_[0], - conv_param.input_spatial_lengths_[1]}, - { - conv_param.C_, // g - conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] * - conv_param.G_ * conv_param.C_, // n - 1, // c - conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi - conv_param.G_ * conv_param.C_ // wi - }); - - const auto wei_g_k_c_xs_desc = - HostTensorDescriptor({conv_param.G_, - conv_param.K_, - conv_param.C_, - conv_param.filter_spatial_lengths_[0], - conv_param.filter_spatial_lengths_[1]}, - { - conv_param.K_ * conv_param.filter_spatial_lengths_[0] * - conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g - conv_param.filter_spatial_lengths_[0] * - conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k - 1, // c - conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y - conv_param.C_ // x - }); - - const auto bias_g_n_k_wos_desc = - HostTensorDescriptor({conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1]}, - { - conv_param.K_, // g - 0, // n - 1, // k - 0, // ho - 0 // wo - }); - - const auto residual_g_n_k_wos_desc = - HostTensorDescriptor({conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1]}, - { - conv_param.K_, // g - 0, // n - 1, // k - 0, // ho - 0 // wo - }); - - const auto out_g_n_k_wos_desc = HostTensorDescriptor( - {conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1]}, - { - conv_param.K_, // g - conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] * - conv_param.G_ * conv_param.K_, // n - 1, // k - conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho - conv_param.G_ * conv_param.K_ // wo - }); - - return run_grouped_conv_fwd_bias_relu_add<2, - InKernelDataType, - WeiKernelDataType, - CShuffleDataType, - OutKernelDataType, - InElementOp, - WeiElementOp, - OutElementOp, - InUserDataType, - WeiUserDataType, - OutUserDataType, - DeviceGroupedConvNDFwdInstance<2, - InLayout, - WeiLayout, - BiasLayout, - ResidualLayout, - OutLayout>>( - do_verification, - init_method, - time_kernel, - conv_param, - in_g_n_c_wis_desc, - wei_g_k_c_xs_desc, - bias_g_n_k_wos_desc, - residual_g_n_k_wos_desc, - out_g_n_k_wos_desc, - in_element_op, - wei_element_op, - out_element_op); - } - else if(conv_param.num_dim_spatial_ == 3) - { - using InLayout = ctc::G_NDHW_C; - using WeiLayout = ctc::G_K_ZYX_C; - using BiasLayout = ctc::G_K; - using ResidualLayout = ctc::G_NDHW_K; - using OutLayout = ctc::G_NDHW_K; - - const auto in_g_n_c_wis_desc = HostTensorDescriptor( - {conv_param.G_, - conv_param.N_, - conv_param.C_, - conv_param.input_spatial_lengths_[0], - conv_param.input_spatial_lengths_[1], - conv_param.input_spatial_lengths_[2]}, - { - conv_param.C_, // g - conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] * - conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n - 1, // c - conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] * - conv_param.G_ * conv_param.C_, // di - conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi - conv_param.G_ * conv_param.C_ // wi - }); - - const auto wei_g_k_c_xs_desc = HostTensorDescriptor( - {conv_param.G_, - conv_param.K_, - conv_param.C_, - conv_param.filter_spatial_lengths_[0], - conv_param.filter_spatial_lengths_[1], - conv_param.filter_spatial_lengths_[2]}, - { - conv_param.K_ * conv_param.filter_spatial_lengths_[0] * - conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] * - conv_param.C_, // g - conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] * - conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k - 1, // c - conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] * - conv_param.C_, // z - conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y - conv_param.C_ // x - }); - - const auto bias_g_n_k_wos_desc = - HostTensorDescriptor({conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1], - conv_param.output_spatial_lengths_[2]}, - { - conv_param.K_, // g - 0, // n - 1, // k - 0, // z - 0, // y - 0 // x - }); - - const auto residual_g_n_k_wos_desc = - HostTensorDescriptor({conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1], - conv_param.output_spatial_lengths_[2]}, - { - conv_param.K_, // g - 0, // n - 1, // k - 0, // z - 0, // y - 0 // x - }); - - const auto out_g_n_k_wos_desc = HostTensorDescriptor( - {conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1], - conv_param.output_spatial_lengths_[2]}, - { - conv_param.K_, // g - conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] * - conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n - 1, // k - conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] * - conv_param.G_ * conv_param.K_, // do - conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho - conv_param.G_ * conv_param.K_ // wo - }); - - return run_grouped_conv_fwd_bias_relu_add<3, - InKernelDataType, - WeiKernelDataType, - CShuffleDataType, - OutKernelDataType, - InElementOp, - WeiElementOp, - OutElementOp, - InUserDataType, - WeiUserDataType, - OutUserDataType, - DeviceGroupedConvNDFwdInstance<3, - InLayout, - WeiLayout, - BiasLayout, - ResidualLayout, - OutLayout>>( - do_verification, - init_method, - time_kernel, - conv_param, - in_g_n_c_wis_desc, - wei_g_k_c_xs_desc, - bias_g_n_k_wos_desc, - residual_g_n_k_wos_desc, - out_g_n_k_wos_desc, - in_element_op, - wei_element_op, - out_element_op); - } - - return 0; -} diff --git a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp deleted file mode 100644 index d5a05a2cf6..0000000000 --- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp16.cpp +++ /dev/null @@ -1,459 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "grouped_convnd_fwd_bias_relu_add_common.hpp" - -#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" - -#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" - -// kernel data types -using InKernelDataType = ck::half_t; -using WeiKernelDataType = ck::half_t; -using AccDataType = float; -using CShuffleDataType = ck::half_t; -using BiasKernelDataType = ck::half_t; -using ResidualKernelDataType = ck::half_t; -using OutKernelDataType = ck::half_t; - -// tensor data types -using InUserDataType = InKernelDataType; -using WeiUserDataType = WeiKernelDataType; -using OutUserDataType = OutKernelDataType; - -template -using S = ck::Sequence; - -using InElementOp = ck::tensor_operation::element_wise::PassThrough; -using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; -using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd; - -static constexpr auto ConvSpec = - ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; - -static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - -template -using DeviceGroupedConvNDFwdInstance = - ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< - NDimSpatial, - InLayout, - WeiLayout, - ck::Tuple, - OutLayout, - InKernelDataType, - WeiKernelDataType, - AccDataType, - CShuffleDataType, - ck::Tuple, - OutKernelDataType, - InElementOp, - WeiElementOp, - OutElementOp, - ConvSpec, // ConvForwardSpecialization - GemmSpec, // GemmSpecialization - 1, // - 256, // BlockSize - 128, // MPerBlock - 256, // NPerBlock - 32, // KPerBlock - 8, // AK1 - 8, // BK1 - 32, // MPerXdl - 32, // NPerXdl - 2, // MXdlPerWave - 4, // NXdlPerWave - S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 - S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder - S<1, 0, 2>, // ABlockTransferSrcAccessOrder - 2, // ABlockTransferSrcVectorDim - 8, // ABlockTransferSrcScalarPerVector - 8, // ABlockTransferDstScalarPerVector_AK1 - 1, // ABlockLdsExtraM - S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 - S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder - S<1, 0, 2>, // BBlockTransferSrcAccessOrder - 2, // BBlockTransferSrcVectorDim - 8, // BBlockTransferSrcScalarPerVector - 8, // BBlockTransferDstScalarPerVector_BK1 - 1, // BBlockLdsExtraN - 1, - 1, - S<1, 32, 1, 8>, - 8>; - -int main(int argc, char* argv[]) -{ - namespace ctc = ck::tensor_layout::convolution; - - print_helper_msg(); - - bool do_verification = true; - int init_method = 1; - bool time_kernel = false; - - // conventional group conv definition - // G = 2 - // [N, C, Hi, Wi] = [128, 384, 71, 71] - // [K, C, Y, X] = [512, 192, 3, 3] - // [N, K, Ho, Wo] = [128, 512, 36, 36] - // CK group conv definition - // [G, N, C, Hi, Wi] = [2, 128, 192, 71, 71] - // [G, K, C, Y, X] = [2, 256, 192, 3, 3] - // [G, N, K, Ho, Wo] = [2, 128, 256, 36, 36] - ck::utils::conv::ConvParam conv_param{ - 2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}}; - - if(argc == 1) - { - // use default - } - else if(argc == 4) - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - time_kernel = std::stoi(argv[3]); - } - else - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - time_kernel = std::stoi(argv[3]); - const ck::index_t num_dim_spatial = std::stoi(argv[4]); - - conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv); - } - - const auto in_element_op = InElementOp{}; - const auto wei_element_op = WeiElementOp{}; - const auto out_element_op = OutElementOp{}; - - if(conv_param.num_dim_spatial_ == 1) - { - using InLayout = ctc::G_NW_C; - using WeiLayout = ctc::G_K_X_C; - using BiasLayout = ctc::G_K; - using ResidualLayout = ctc::G_NW_K; - using OutLayout = ctc::G_NW_K; - - const auto in_g_n_c_wis_desc = HostTensorDescriptor( - {conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]}, - { - conv_param.C_, // g - conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n - 1, // c - conv_param.G_ * conv_param.C_ // wi - }); - - const auto wei_g_k_c_xs_desc = HostTensorDescriptor( - {conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]}, - { - conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g - conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k - 1, // c - conv_param.C_ // x - }); - - const auto bias_g_n_k_wos_desc = HostTensorDescriptor( - {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]}, - { - conv_param.K_, // g - 0, // n - 1, // k - 0 // wo - }); - - const auto residual_g_n_k_wos_desc = HostTensorDescriptor( - {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]}, - { - conv_param.K_, // g - 0, // k - 1, // c - 0 // x - }); - - const auto out_g_n_k_wos_desc = HostTensorDescriptor( - {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]}, - { - conv_param.K_, // g - conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n - 1, // k - conv_param.G_ * conv_param.K_ // wo - }); - - return run_grouped_conv_fwd_bias_relu_add<1, - InKernelDataType, - WeiKernelDataType, - CShuffleDataType, - OutKernelDataType, - InElementOp, - WeiElementOp, - OutElementOp, - InUserDataType, - WeiUserDataType, - OutUserDataType, - DeviceGroupedConvNDFwdInstance<1, - InLayout, - WeiLayout, - BiasLayout, - ResidualLayout, - OutLayout>>( - do_verification, - init_method, - time_kernel, - conv_param, - in_g_n_c_wis_desc, - wei_g_k_c_xs_desc, - bias_g_n_k_wos_desc, - residual_g_n_k_wos_desc, - out_g_n_k_wos_desc, - in_element_op, - wei_element_op, - out_element_op); - } - else if(conv_param.num_dim_spatial_ == 2) - { - using InLayout = ctc::G_NHW_C; - using WeiLayout = ctc::G_K_YX_C; - using BiasLayout = ctc::G_K; - using ResidualLayout = ctc::G_NHW_K; - using OutLayout = ctc::G_NHW_K; - - const auto in_g_n_c_wis_desc = HostTensorDescriptor( - {conv_param.G_, - conv_param.N_, - conv_param.C_, - conv_param.input_spatial_lengths_[0], - conv_param.input_spatial_lengths_[1]}, - { - conv_param.C_, // g - conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] * - conv_param.G_ * conv_param.C_, // n - 1, // c - conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi - conv_param.G_ * conv_param.C_ // wi - }); - - const auto wei_g_k_c_xs_desc = - HostTensorDescriptor({conv_param.G_, - conv_param.K_, - conv_param.C_, - conv_param.filter_spatial_lengths_[0], - conv_param.filter_spatial_lengths_[1]}, - { - conv_param.K_ * conv_param.filter_spatial_lengths_[0] * - conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g - conv_param.filter_spatial_lengths_[0] * - conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k - 1, // c - conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y - conv_param.C_ // x - }); - - const auto bias_g_n_k_wos_desc = - HostTensorDescriptor({conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1]}, - { - conv_param.K_, // g - 0, // n - 1, // k - 0, // ho - 0 // wo - }); - - const auto residual_g_n_k_wos_desc = - HostTensorDescriptor({conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1]}, - { - conv_param.K_, // g - 0, // n - 1, // k - 0, // ho - 0 // wo - }); - - const auto out_g_n_k_wos_desc = HostTensorDescriptor( - {conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1]}, - { - conv_param.K_, // g - conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] * - conv_param.G_ * conv_param.K_, // n - 1, // k - conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho - conv_param.G_ * conv_param.K_ // wo - }); - - return run_grouped_conv_fwd_bias_relu_add<2, - InKernelDataType, - WeiKernelDataType, - CShuffleDataType, - OutKernelDataType, - InElementOp, - WeiElementOp, - OutElementOp, - InUserDataType, - WeiUserDataType, - OutUserDataType, - DeviceGroupedConvNDFwdInstance<2, - InLayout, - WeiLayout, - BiasLayout, - ResidualLayout, - OutLayout>>( - do_verification, - init_method, - time_kernel, - conv_param, - in_g_n_c_wis_desc, - wei_g_k_c_xs_desc, - bias_g_n_k_wos_desc, - residual_g_n_k_wos_desc, - out_g_n_k_wos_desc, - in_element_op, - wei_element_op, - out_element_op); - } - else if(conv_param.num_dim_spatial_ == 3) - { - using InLayout = ctc::G_NDHW_C; - using WeiLayout = ctc::G_K_ZYX_C; - using BiasLayout = ctc::G_K; - using ResidualLayout = ctc::G_NDHW_K; - using OutLayout = ctc::G_NDHW_K; - - const auto in_g_n_c_wis_desc = HostTensorDescriptor( - {conv_param.G_, - conv_param.N_, - conv_param.C_, - conv_param.input_spatial_lengths_[0], - conv_param.input_spatial_lengths_[1], - conv_param.input_spatial_lengths_[2]}, - { - conv_param.C_, // g - conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] * - conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n - 1, // c - conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] * - conv_param.G_ * conv_param.C_, // di - conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi - conv_param.G_ * conv_param.C_ // wi - }); - - const auto wei_g_k_c_xs_desc = HostTensorDescriptor( - {conv_param.G_, - conv_param.K_, - conv_param.C_, - conv_param.filter_spatial_lengths_[0], - conv_param.filter_spatial_lengths_[1], - conv_param.filter_spatial_lengths_[2]}, - { - conv_param.K_ * conv_param.filter_spatial_lengths_[0] * - conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] * - conv_param.C_, // g - conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] * - conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k - 1, // c - conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] * - conv_param.C_, // z - conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y - conv_param.C_ // x - }); - - const auto bias_g_n_k_wos_desc = - HostTensorDescriptor({conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1], - conv_param.output_spatial_lengths_[2]}, - { - conv_param.K_, // g - 0, // n - 1, // k - 0, // z - 0, // y - 0 // x - }); - - const auto residual_g_n_k_wos_desc = - HostTensorDescriptor({conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1], - conv_param.output_spatial_lengths_[2]}, - { - conv_param.K_, // g - 0, // n - 1, // k - 0, // z - 0, // y - 0 // x - }); - - const auto out_g_n_k_wos_desc = HostTensorDescriptor( - {conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1], - conv_param.output_spatial_lengths_[2]}, - { - conv_param.K_, // g - conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] * - conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n - 1, // k - conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] * - conv_param.G_ * conv_param.K_, // do - conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho - conv_param.G_ * conv_param.K_ // wo - }); - - return run_grouped_conv_fwd_bias_relu_add<3, - InKernelDataType, - WeiKernelDataType, - CShuffleDataType, - OutKernelDataType, - InElementOp, - WeiElementOp, - OutElementOp, - InUserDataType, - WeiUserDataType, - OutUserDataType, - DeviceGroupedConvNDFwdInstance<3, - InLayout, - WeiLayout, - BiasLayout, - ResidualLayout, - OutLayout>>( - do_verification, - init_method, - time_kernel, - conv_param, - in_g_n_c_wis_desc, - wei_g_k_c_xs_desc, - bias_g_n_k_wos_desc, - residual_g_n_k_wos_desc, - out_g_n_k_wos_desc, - in_element_op, - wei_element_op, - out_element_op); - } - - return 0; -} diff --git a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp deleted file mode 100644 index 2e5dbb5948..0000000000 --- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_fp32.cpp +++ /dev/null @@ -1,459 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "grouped_convnd_fwd_bias_relu_add_common.hpp" - -#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" - -#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" - -// kernel data types -using InKernelDataType = float; -using WeiKernelDataType = float; -using AccDataType = float; -using CShuffleDataType = float; -using BiasKernelDataType = float; -using ResidualKernelDataType = float; -using OutKernelDataType = float; - -// tensor data types -using InUserDataType = InKernelDataType; -using WeiUserDataType = WeiKernelDataType; -using OutUserDataType = OutKernelDataType; - -template -using S = ck::Sequence; - -using InElementOp = ck::tensor_operation::element_wise::PassThrough; -using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; -using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd; - -static constexpr auto ConvSpec = - ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; - -static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - -template -using DeviceGroupedConvNDFwdInstance = - ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< - NDimSpatial, - InLayout, - WeiLayout, - ck::Tuple, - OutLayout, - InKernelDataType, - WeiKernelDataType, - AccDataType, - CShuffleDataType, - ck::Tuple, - OutKernelDataType, - InElementOp, - WeiElementOp, - OutElementOp, - ConvSpec, // ConvForwardSpecialization - GemmSpec, // GemmSpecialization - 1, // - 256, // BlockSize - 128, // MPerBlock - 256, // NPerBlock - 16, // KPerBlock - 4, // AK1 - 4, // BK1 - 32, // MPerXdl - 32, // NPerXdl - 2, // MXdlPerWave - 4, // NXdlPerWave - S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 - S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder - S<1, 0, 2>, // ABlockTransferSrcAccessOrder - 2, // ABlockTransferSrcVectorDim - 4, // ABlockTransferSrcScalarPerVector - 4, // ABlockTransferDstScalarPerVector_AK1 - 1, // ABlockLdsExtraM - S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 - S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder - S<1, 0, 2>, // BBlockTransferSrcAccessOrder - 2, // BBlockTransferSrcVectorDim - 4, // BBlockTransferSrcScalarPerVector - 4, // BBlockTransferDstScalarPerVector_BK1 - 1, // BBlockLdsExtraN - 1, - 1, - S<1, 16, 1, 16>, - 4>; - -int main(int argc, char* argv[]) -{ - namespace ctc = ck::tensor_layout::convolution; - - print_helper_msg(); - - bool do_verification = true; - int init_method = 1; - bool time_kernel = false; - - // conventional group conv definition - // G = 2 - // [N, C, Hi, Wi] = [128, 384, 71, 71] - // [K, C, Y, X] = [512, 192, 3, 3] - // [N, K, Ho, Wo] = [128, 512, 36, 36] - // CK group conv definition - // [G, N, C, Hi, Wi] = [2, 128, 192, 71, 71] - // [G, K, C, Y, X] = [2, 256, 192, 3, 3] - // [G, N, K, Ho, Wo] = [2, 128, 256, 36, 36] - ck::utils::conv::ConvParam conv_param{ - 2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}}; - - if(argc == 1) - { - // use default - } - else if(argc == 4) - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - time_kernel = std::stoi(argv[3]); - } - else - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - time_kernel = std::stoi(argv[3]); - const ck::index_t num_dim_spatial = std::stoi(argv[4]); - - conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv); - } - - const auto in_element_op = InElementOp{}; - const auto wei_element_op = WeiElementOp{}; - const auto out_element_op = OutElementOp{}; - - if(conv_param.num_dim_spatial_ == 1) - { - using InLayout = ctc::G_NW_C; - using WeiLayout = ctc::G_K_X_C; - using BiasLayout = ctc::G_K; - using ResidualLayout = ctc::G_NW_K; - using OutLayout = ctc::G_NW_K; - - const auto in_g_n_c_wis_desc = HostTensorDescriptor( - {conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]}, - { - conv_param.C_, // g - conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n - 1, // c - conv_param.G_ * conv_param.C_ // wi - }); - - const auto wei_g_k_c_xs_desc = HostTensorDescriptor( - {conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]}, - { - conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g - conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k - 1, // c - conv_param.C_ // x - }); - - const auto bias_g_n_k_wos_desc = HostTensorDescriptor( - {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]}, - { - conv_param.K_, // g - 0, // k - 1, // c - 0 // x - }); - - const auto residual_g_n_k_wos_desc = HostTensorDescriptor( - {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]}, - { - conv_param.K_, // g - 0, // k - 1, // c - 0 // x - }); - - const auto out_g_n_k_wos_desc = HostTensorDescriptor( - {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]}, - { - conv_param.K_, // g - conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n - 1, // k - conv_param.G_ * conv_param.K_ // wo - }); - - return run_grouped_conv_fwd_bias_relu_add<1, - InKernelDataType, - WeiKernelDataType, - CShuffleDataType, - OutKernelDataType, - InElementOp, - WeiElementOp, - OutElementOp, - InUserDataType, - WeiUserDataType, - OutUserDataType, - DeviceGroupedConvNDFwdInstance<1, - InLayout, - WeiLayout, - BiasLayout, - ResidualLayout, - OutLayout>>( - do_verification, - init_method, - time_kernel, - conv_param, - in_g_n_c_wis_desc, - wei_g_k_c_xs_desc, - bias_g_n_k_wos_desc, - residual_g_n_k_wos_desc, - out_g_n_k_wos_desc, - in_element_op, - wei_element_op, - out_element_op); - } - else if(conv_param.num_dim_spatial_ == 2) - { - using InLayout = ctc::G_NHW_C; - using WeiLayout = ctc::G_K_YX_C; - using BiasLayout = ctc::G_K; - using ResidualLayout = ctc::G_NHW_K; - using OutLayout = ctc::G_NHW_K; - - const auto in_g_n_c_wis_desc = HostTensorDescriptor( - {conv_param.G_, - conv_param.N_, - conv_param.C_, - conv_param.input_spatial_lengths_[0], - conv_param.input_spatial_lengths_[1]}, - { - conv_param.C_, // g - conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] * - conv_param.G_ * conv_param.C_, // n - 1, // c - conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi - conv_param.G_ * conv_param.C_ // wi - }); - - const auto wei_g_k_c_xs_desc = - HostTensorDescriptor({conv_param.G_, - conv_param.K_, - conv_param.C_, - conv_param.filter_spatial_lengths_[0], - conv_param.filter_spatial_lengths_[1]}, - { - conv_param.K_ * conv_param.filter_spatial_lengths_[0] * - conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g - conv_param.filter_spatial_lengths_[0] * - conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k - 1, // c - conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y - conv_param.C_ // x - }); - - const auto bias_g_n_k_wos_desc = - HostTensorDescriptor({conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1]}, - { - conv_param.K_, // g - 0, // n - 1, // k - 0, // ho - 0 // wo - }); - - const auto residual_g_n_k_wos_desc = - HostTensorDescriptor({conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1]}, - { - conv_param.K_, // g - 0, // n - 1, // k - 0, // ho - 0 // wo - }); - - const auto out_g_n_k_wos_desc = HostTensorDescriptor( - {conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1]}, - { - conv_param.K_, // g - conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] * - conv_param.G_ * conv_param.K_, // n - 1, // k - conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho - conv_param.G_ * conv_param.K_ // wo - }); - - return run_grouped_conv_fwd_bias_relu_add<2, - InKernelDataType, - WeiKernelDataType, - CShuffleDataType, - OutKernelDataType, - InElementOp, - WeiElementOp, - OutElementOp, - InUserDataType, - WeiUserDataType, - OutUserDataType, - DeviceGroupedConvNDFwdInstance<2, - InLayout, - WeiLayout, - BiasLayout, - ResidualLayout, - OutLayout>>( - do_verification, - init_method, - time_kernel, - conv_param, - in_g_n_c_wis_desc, - wei_g_k_c_xs_desc, - bias_g_n_k_wos_desc, - residual_g_n_k_wos_desc, - out_g_n_k_wos_desc, - in_element_op, - wei_element_op, - out_element_op); - } - else if(conv_param.num_dim_spatial_ == 3) - { - using InLayout = ctc::G_NDHW_C; - using WeiLayout = ctc::G_K_ZYX_C; - using BiasLayout = ctc::G_K; - using ResidualLayout = ctc::G_NDHW_K; - using OutLayout = ctc::G_NDHW_K; - - const auto in_g_n_c_wis_desc = HostTensorDescriptor( - {conv_param.G_, - conv_param.N_, - conv_param.C_, - conv_param.input_spatial_lengths_[0], - conv_param.input_spatial_lengths_[1], - conv_param.input_spatial_lengths_[2]}, - { - conv_param.C_, // g - conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] * - conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n - 1, // c - conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] * - conv_param.G_ * conv_param.C_, // di - conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi - conv_param.G_ * conv_param.C_ // wi - }); - - const auto wei_g_k_c_xs_desc = HostTensorDescriptor( - {conv_param.G_, - conv_param.K_, - conv_param.C_, - conv_param.filter_spatial_lengths_[0], - conv_param.filter_spatial_lengths_[1], - conv_param.filter_spatial_lengths_[2]}, - { - conv_param.K_ * conv_param.filter_spatial_lengths_[0] * - conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] * - conv_param.C_, // g - conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] * - conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k - 1, // c - conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] * - conv_param.C_, // z - conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y - conv_param.C_ // x - }); - - const auto bias_g_n_k_wos_desc = - HostTensorDescriptor({conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1], - conv_param.output_spatial_lengths_[2]}, - { - conv_param.K_, // g - 0, // n - 1, // k - 0, // z - 0, // y - 0 // x - }); - - const auto residual_g_n_k_wos_desc = - HostTensorDescriptor({conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1], - conv_param.output_spatial_lengths_[2]}, - { - conv_param.K_, // g - 0, // n - 1, // k - 0, // z - 0, // y - 0 // x - }); - - const auto out_g_n_k_wos_desc = HostTensorDescriptor( - {conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1], - conv_param.output_spatial_lengths_[2]}, - { - conv_param.K_, // g - conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] * - conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n - 1, // k - conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] * - conv_param.G_ * conv_param.K_, // do - conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho - conv_param.G_ * conv_param.K_ // wo - }); - - return run_grouped_conv_fwd_bias_relu_add<3, - InKernelDataType, - WeiKernelDataType, - CShuffleDataType, - OutKernelDataType, - InElementOp, - WeiElementOp, - OutElementOp, - InUserDataType, - WeiUserDataType, - OutUserDataType, - DeviceGroupedConvNDFwdInstance<3, - InLayout, - WeiLayout, - BiasLayout, - ResidualLayout, - OutLayout>>( - do_verification, - init_method, - time_kernel, - conv_param, - in_g_n_c_wis_desc, - wei_g_k_c_xs_desc, - bias_g_n_k_wos_desc, - residual_g_n_k_wos_desc, - out_g_n_k_wos_desc, - in_element_op, - wei_element_op, - out_element_op); - } - - return 0; -} diff --git a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp deleted file mode 100644 index 9c96015cd8..0000000000 --- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int4.cpp +++ /dev/null @@ -1,459 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "grouped_convnd_fwd_bias_relu_add_common.hpp" - -#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" - -#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" - -// kernel data types -using InKernelDataType = int8_t; -using WeiKernelDataType = int8_t; -using AccDataType = int32_t; -using CShuffleDataType = int8_t; -using BiasKernelDataType = int8_t; -using ResidualKernelDataType = int8_t; -using OutKernelDataType = int8_t; - -// tensor data types -using InUserDataType = ck::int4_t; -using WeiUserDataType = ck::int4_t; -using OutUserDataType = ck::int4_t; - -template -using S = ck::Sequence; - -using InElementOp = ck::tensor_operation::element_wise::PassThrough; -using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; -using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd; - -static constexpr auto ConvSpec = - ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; - -static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - -template -using DeviceGroupedConvNDFwdInstance = - ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< - NDimSpatial, - InLayout, - WeiLayout, - ck::Tuple, - OutLayout, - InKernelDataType, - WeiKernelDataType, - AccDataType, - CShuffleDataType, - ck::Tuple, - OutKernelDataType, - InElementOp, - WeiElementOp, - OutElementOp, - ConvSpec, // ConvForwardSpecialization - GemmSpec, // GemmSpecialization - 1, // - 256, // BlockSize - 128, // MPerBlock - 256, // NPerBlock - 64, // KPerBlock - 16, // AK1 - 16, // BK1 - 32, // MPerXdl - 32, // NPerXdl - 2, // MXdlPerWave - 4, // NXdlPerWave - S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 - S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder - S<1, 0, 2>, // ABlockTransferSrcAccessOrder - 2, // ABlockTransferSrcVectorDim - 16, // ABlockTransferSrcScalarPerVector - 16, // ABlockTransferDstScalarPerVector_AK1 - 1, // ABlockLdsExtraM - S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 - S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder - S<1, 0, 2>, // BBlockTransferSrcAccessOrder - 2, // BBlockTransferSrcVectorDim - 16, // BBlockTransferSrcScalarPerVector - 16, // BBlockTransferDstScalarPerVector_BK1 - 1, // BBlockLdsExtraN - 1, - 1, - S<1, 64, 1, 4>, - 16>; - -int main(int argc, char* argv[]) -{ - namespace ctc = ck::tensor_layout::convolution; - - print_helper_msg(); - - bool do_verification = true; - int init_method = 1; - bool time_kernel = false; - - // conventional group conv definition - // G = 2 - // [N, C, Hi, Wi] = [128, 384, 71, 71] - // [K, C, Y, X] = [512, 192, 3, 3] - // [N, K, Ho, Wo] = [128, 512, 36, 36] - // CK group conv definition - // [G, N, C, Hi, Wi] = [2, 128, 192, 71, 71] - // [G, K, C, Y, X] = [2, 256, 192, 3, 3] - // [G, N, K, Ho, Wo] = [2, 128, 256, 36, 36] - ck::utils::conv::ConvParam conv_param{ - 2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}}; - - if(argc == 1) - { - // use default - } - else if(argc == 4) - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - time_kernel = std::stoi(argv[3]); - } - else - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - time_kernel = std::stoi(argv[3]); - const ck::index_t num_dim_spatial = std::stoi(argv[4]); - - conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv); - } - - const auto in_element_op = InElementOp{}; - const auto wei_element_op = WeiElementOp{}; - const auto out_element_op = OutElementOp{}; - - if(conv_param.num_dim_spatial_ == 1) - { - using InLayout = ctc::G_NW_C; - using WeiLayout = ctc::G_K_X_C; - using BiasLayout = ctc::G_K; - using ResidualLayout = ctc::G_NW_K; - using OutLayout = ctc::G_NW_K; - - const auto in_g_n_c_wis_desc = HostTensorDescriptor( - {conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]}, - { - conv_param.C_, // g - conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n - 1, // c - conv_param.G_ * conv_param.C_ // wi - }); - - const auto wei_g_k_c_xs_desc = HostTensorDescriptor( - {conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]}, - { - conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g - conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k - 1, // c - conv_param.C_ // x - }); - - const auto bias_g_n_k_wos_desc = HostTensorDescriptor( - {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]}, - { - conv_param.K_, // g - 0, // k - 1, // c - 0 // x - }); - - const auto residual_g_n_k_wos_desc = HostTensorDescriptor( - {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]}, - { - conv_param.K_, // g - 0, // k - 1, // c - 0 // x - }); - - const auto out_g_n_k_wos_desc = HostTensorDescriptor( - {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]}, - { - conv_param.K_, // g - conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n - 1, // k - conv_param.G_ * conv_param.K_ // wo - }); - - return run_grouped_conv_fwd_bias_relu_add<1, - InKernelDataType, - WeiKernelDataType, - CShuffleDataType, - OutKernelDataType, - InElementOp, - WeiElementOp, - OutElementOp, - InUserDataType, - WeiUserDataType, - OutUserDataType, - DeviceGroupedConvNDFwdInstance<1, - InLayout, - WeiLayout, - BiasLayout, - ResidualLayout, - OutLayout>>( - do_verification, - init_method, - time_kernel, - conv_param, - in_g_n_c_wis_desc, - wei_g_k_c_xs_desc, - bias_g_n_k_wos_desc, - residual_g_n_k_wos_desc, - out_g_n_k_wos_desc, - in_element_op, - wei_element_op, - out_element_op); - } - else if(conv_param.num_dim_spatial_ == 2) - { - using InLayout = ctc::G_NHW_C; - using WeiLayout = ctc::G_K_YX_C; - using BiasLayout = ctc::G_K; - using ResidualLayout = ctc::G_NHW_K; - using OutLayout = ctc::G_NHW_K; - - const auto in_g_n_c_wis_desc = HostTensorDescriptor( - {conv_param.G_, - conv_param.N_, - conv_param.C_, - conv_param.input_spatial_lengths_[0], - conv_param.input_spatial_lengths_[1]}, - { - conv_param.C_, // g - conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] * - conv_param.G_ * conv_param.C_, // n - 1, // c - conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi - conv_param.G_ * conv_param.C_ // wi - }); - - const auto wei_g_k_c_xs_desc = - HostTensorDescriptor({conv_param.G_, - conv_param.K_, - conv_param.C_, - conv_param.filter_spatial_lengths_[0], - conv_param.filter_spatial_lengths_[1]}, - { - conv_param.K_ * conv_param.filter_spatial_lengths_[0] * - conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g - conv_param.filter_spatial_lengths_[0] * - conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k - 1, // c - conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y - conv_param.C_ // x - }); - - const auto bias_g_n_k_wos_desc = - HostTensorDescriptor({conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1]}, - { - conv_param.K_, // g - 0, // n - 1, // k - 0, // ho - 0 // wo - }); - - const auto residual_g_n_k_wos_desc = - HostTensorDescriptor({conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1]}, - { - conv_param.K_, // g - 0, // n - 1, // k - 0, // ho - 0 // wo - }); - - const auto out_g_n_k_wos_desc = HostTensorDescriptor( - {conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1]}, - { - conv_param.K_, // g - conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] * - conv_param.G_ * conv_param.K_, // n - 1, // k - conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho - conv_param.G_ * conv_param.K_ // wo - }); - - return run_grouped_conv_fwd_bias_relu_add<2, - InKernelDataType, - WeiKernelDataType, - CShuffleDataType, - OutKernelDataType, - InElementOp, - WeiElementOp, - OutElementOp, - InUserDataType, - WeiUserDataType, - OutUserDataType, - DeviceGroupedConvNDFwdInstance<2, - InLayout, - WeiLayout, - BiasLayout, - ResidualLayout, - OutLayout>>( - do_verification, - init_method, - time_kernel, - conv_param, - in_g_n_c_wis_desc, - wei_g_k_c_xs_desc, - bias_g_n_k_wos_desc, - residual_g_n_k_wos_desc, - out_g_n_k_wos_desc, - in_element_op, - wei_element_op, - out_element_op); - } - else if(conv_param.num_dim_spatial_ == 3) - { - using InLayout = ctc::G_NDHW_C; - using WeiLayout = ctc::G_K_ZYX_C; - using BiasLayout = ctc::G_K; - using ResidualLayout = ctc::G_NDHW_K; - using OutLayout = ctc::G_NDHW_K; - - const auto in_g_n_c_wis_desc = HostTensorDescriptor( - {conv_param.G_, - conv_param.N_, - conv_param.C_, - conv_param.input_spatial_lengths_[0], - conv_param.input_spatial_lengths_[1], - conv_param.input_spatial_lengths_[2]}, - { - conv_param.C_, // g - conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] * - conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n - 1, // c - conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] * - conv_param.G_ * conv_param.C_, // di - conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi - conv_param.G_ * conv_param.C_ // wi - }); - - const auto wei_g_k_c_xs_desc = HostTensorDescriptor( - {conv_param.G_, - conv_param.K_, - conv_param.C_, - conv_param.filter_spatial_lengths_[0], - conv_param.filter_spatial_lengths_[1], - conv_param.filter_spatial_lengths_[2]}, - { - conv_param.K_ * conv_param.filter_spatial_lengths_[0] * - conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] * - conv_param.C_, // g - conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] * - conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k - 1, // c - conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] * - conv_param.C_, // z - conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y - conv_param.C_ // x - }); - - const auto bias_g_n_k_wos_desc = - HostTensorDescriptor({conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1], - conv_param.output_spatial_lengths_[2]}, - { - conv_param.K_, // g - 0, // n - 1, // k - 0, // z - 0, // y - 0 // x - }); - - const auto residual_g_n_k_wos_desc = - HostTensorDescriptor({conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1], - conv_param.output_spatial_lengths_[2]}, - { - conv_param.K_, // g - 0, // n - 1, // k - 0, // z - 0, // y - 0 // x - }); - - const auto out_g_n_k_wos_desc = HostTensorDescriptor( - {conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1], - conv_param.output_spatial_lengths_[2]}, - { - conv_param.K_, // g - conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] * - conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n - 1, // k - conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] * - conv_param.G_ * conv_param.K_, // do - conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho - conv_param.G_ * conv_param.K_ // wo - }); - - return run_grouped_conv_fwd_bias_relu_add<3, - InKernelDataType, - WeiKernelDataType, - CShuffleDataType, - OutKernelDataType, - InElementOp, - WeiElementOp, - OutElementOp, - InUserDataType, - WeiUserDataType, - OutUserDataType, - DeviceGroupedConvNDFwdInstance<3, - InLayout, - WeiLayout, - BiasLayout, - ResidualLayout, - OutLayout>>( - do_verification, - init_method, - time_kernel, - conv_param, - in_g_n_c_wis_desc, - wei_g_k_c_xs_desc, - bias_g_n_k_wos_desc, - residual_g_n_k_wos_desc, - out_g_n_k_wos_desc, - in_element_op, - wei_element_op, - out_element_op); - } - - return 0; -} diff --git a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp b/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp deleted file mode 100644 index 3a366ceceb..0000000000 --- a/example/30_grouped_convnd_fwd_bias_relu_add/grouped_convnd_fwd_bias_relu_add_xdl_int8.cpp +++ /dev/null @@ -1,459 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "grouped_convnd_fwd_bias_relu_add_common.hpp" - -#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_cshuffle.hpp" - -#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" - -// kernel data types -using InKernelDataType = int8_t; -using WeiKernelDataType = int8_t; -using AccDataType = int32_t; -using CShuffleDataType = int8_t; -using BiasKernelDataType = int8_t; -using ResidualKernelDataType = int8_t; -using OutKernelDataType = int8_t; - -// tensor data types -using InUserDataType = InKernelDataType; -using WeiUserDataType = WeiKernelDataType; -using OutUserDataType = OutKernelDataType; - -template -using S = ck::Sequence; - -using InElementOp = ck::tensor_operation::element_wise::PassThrough; -using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; -using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd; - -static constexpr auto ConvSpec = - ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; - -static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; - -template -using DeviceGroupedConvNDFwdInstance = - ck::tensor_operation::device::DeviceGroupedConvFwdMultipleD_Xdl_CShuffle< - NDimSpatial, - InLayout, - WeiLayout, - ck::Tuple, - OutLayout, - InKernelDataType, - WeiKernelDataType, - AccDataType, - CShuffleDataType, - ck::Tuple, - OutKernelDataType, - InElementOp, - WeiElementOp, - OutElementOp, - ConvSpec, // ConvForwardSpecialization - GemmSpec, // GemmSpecialization - 1, // - 256, // BlockSize - 128, // MPerBlock - 256, // NPerBlock - 64, // KPerBlock - 16, // AK1 - 16, // BK1 - 32, // MPerXdl - 32, // NPerXdl - 2, // MXdlPerWave - 4, // NXdlPerWave - S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 - S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder - S<1, 0, 2>, // ABlockTransferSrcAccessOrder - 2, // ABlockTransferSrcVectorDim - 16, // ABlockTransferSrcScalarPerVector - 16, // ABlockTransferDstScalarPerVector_AK1 - 1, // ABlockLdsExtraM - S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 - S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder - S<1, 0, 2>, // BBlockTransferSrcAccessOrder - 2, // BBlockTransferSrcVectorDim - 16, // BBlockTransferSrcScalarPerVector - 16, // BBlockTransferDstScalarPerVector_BK1 - 1, // BBlockLdsExtraN - 1, - 1, - S<1, 64, 1, 4>, - 16>; - -int main(int argc, char* argv[]) -{ - namespace ctc = ck::tensor_layout::convolution; - - print_helper_msg(); - - bool do_verification = true; - int init_method = 1; - bool time_kernel = false; - - // conventional group conv definition - // G = 2 - // [N, C, Hi, Wi] = [128, 384, 71, 71] - // [K, C, Y, X] = [512, 192, 3, 3] - // [N, K, Ho, Wo] = [128, 512, 36, 36] - // CK group conv definition - // [G, N, C, Hi, Wi] = [2, 128, 192, 71, 71] - // [G, K, C, Y, X] = [2, 256, 192, 3, 3] - // [G, N, K, Ho, Wo] = [2, 128, 256, 36, 36] - ck::utils::conv::ConvParam conv_param{ - 2, 2, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}}; - - if(argc == 1) - { - // use default - } - else if(argc == 4) - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - time_kernel = std::stoi(argv[3]); - } - else - { - do_verification = std::stoi(argv[1]); - init_method = std::stoi(argv[2]); - time_kernel = std::stoi(argv[3]); - const ck::index_t num_dim_spatial = std::stoi(argv[4]); - - conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv); - } - - const auto in_element_op = InElementOp{}; - const auto wei_element_op = WeiElementOp{}; - const auto out_element_op = OutElementOp{}; - - if(conv_param.num_dim_spatial_ == 1) - { - using InLayout = ctc::G_NW_C; - using WeiLayout = ctc::G_K_X_C; - using BiasLayout = ctc::G_K; - using ResidualLayout = ctc::G_NW_K; - using OutLayout = ctc::G_NW_K; - - const auto in_g_n_c_wis_desc = HostTensorDescriptor( - {conv_param.G_, conv_param.N_, conv_param.C_, conv_param.input_spatial_lengths_[0]}, - { - conv_param.C_, // g - conv_param.input_spatial_lengths_[0] * conv_param.G_ * conv_param.C_, // n - 1, // c - conv_param.G_ * conv_param.C_ // wi - }); - - const auto wei_g_k_c_xs_desc = HostTensorDescriptor( - {conv_param.G_, conv_param.K_, conv_param.C_, conv_param.filter_spatial_lengths_[0]}, - { - conv_param.K_ * conv_param.filter_spatial_lengths_[0] * conv_param.C_, // g - conv_param.filter_spatial_lengths_[0] * conv_param.C_, // k - 1, // c - conv_param.C_ // x - }); - - const auto bias_g_n_k_wos_desc = HostTensorDescriptor( - {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]}, - { - conv_param.K_, // g - 0, // k - 1, // c - 0 // x - }); - - const auto residual_g_n_k_wos_desc = HostTensorDescriptor( - {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]}, - { - conv_param.K_, // g - 0, // k - 1, // c - 0 // x - }); - - const auto out_g_n_k_wos_desc = HostTensorDescriptor( - {conv_param.G_, conv_param.N_, conv_param.K_, conv_param.output_spatial_lengths_[0]}, - { - conv_param.K_, // g - conv_param.output_spatial_lengths_[0] * conv_param.G_ * conv_param.K_, // n - 1, // k - conv_param.G_ * conv_param.K_ // wo - }); - - return run_grouped_conv_fwd_bias_relu_add<1, - InKernelDataType, - WeiKernelDataType, - CShuffleDataType, - OutKernelDataType, - InElementOp, - WeiElementOp, - OutElementOp, - InUserDataType, - WeiUserDataType, - OutUserDataType, - DeviceGroupedConvNDFwdInstance<1, - InLayout, - WeiLayout, - BiasLayout, - ResidualLayout, - OutLayout>>( - do_verification, - init_method, - time_kernel, - conv_param, - in_g_n_c_wis_desc, - wei_g_k_c_xs_desc, - bias_g_n_k_wos_desc, - residual_g_n_k_wos_desc, - out_g_n_k_wos_desc, - in_element_op, - wei_element_op, - out_element_op); - } - else if(conv_param.num_dim_spatial_ == 2) - { - using InLayout = ctc::G_NHW_C; - using WeiLayout = ctc::G_K_YX_C; - using BiasLayout = ctc::G_K; - using ResidualLayout = ctc::G_NHW_K; - using OutLayout = ctc::G_NHW_K; - - const auto in_g_n_c_wis_desc = HostTensorDescriptor( - {conv_param.G_, - conv_param.N_, - conv_param.C_, - conv_param.input_spatial_lengths_[0], - conv_param.input_spatial_lengths_[1]}, - { - conv_param.C_, // g - conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] * - conv_param.G_ * conv_param.C_, // n - 1, // c - conv_param.input_spatial_lengths_[1] * conv_param.G_ * conv_param.C_, // hi - conv_param.G_ * conv_param.C_ // wi - }); - - const auto wei_g_k_c_xs_desc = - HostTensorDescriptor({conv_param.G_, - conv_param.K_, - conv_param.C_, - conv_param.filter_spatial_lengths_[0], - conv_param.filter_spatial_lengths_[1]}, - { - conv_param.K_ * conv_param.filter_spatial_lengths_[0] * - conv_param.filter_spatial_lengths_[1] * conv_param.C_, // g - conv_param.filter_spatial_lengths_[0] * - conv_param.filter_spatial_lengths_[1] * conv_param.C_, // k - 1, // c - conv_param.filter_spatial_lengths_[1] * conv_param.C_, // y - conv_param.C_ // x - }); - - const auto bias_g_n_k_wos_desc = - HostTensorDescriptor({conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1]}, - { - conv_param.K_, // g - 0, // n - 1, // k - 0, // ho - 0 // wo - }); - - const auto residual_g_n_k_wos_desc = - HostTensorDescriptor({conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1]}, - { - conv_param.K_, // g - 0, // n - 1, // k - 0, // ho - 0 // wo - }); - - const auto out_g_n_k_wos_desc = HostTensorDescriptor( - {conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1]}, - { - conv_param.K_, // g - conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] * - conv_param.G_ * conv_param.K_, // n - 1, // k - conv_param.output_spatial_lengths_[1] * conv_param.G_ * conv_param.K_, // ho - conv_param.G_ * conv_param.K_ // wo - }); - - return run_grouped_conv_fwd_bias_relu_add<2, - InKernelDataType, - WeiKernelDataType, - CShuffleDataType, - OutKernelDataType, - InElementOp, - WeiElementOp, - OutElementOp, - InUserDataType, - WeiUserDataType, - OutUserDataType, - DeviceGroupedConvNDFwdInstance<2, - InLayout, - WeiLayout, - BiasLayout, - ResidualLayout, - OutLayout>>( - do_verification, - init_method, - time_kernel, - conv_param, - in_g_n_c_wis_desc, - wei_g_k_c_xs_desc, - bias_g_n_k_wos_desc, - residual_g_n_k_wos_desc, - out_g_n_k_wos_desc, - in_element_op, - wei_element_op, - out_element_op); - } - else if(conv_param.num_dim_spatial_ == 3) - { - using InLayout = ctc::G_NDHW_C; - using WeiLayout = ctc::G_K_ZYX_C; - using BiasLayout = ctc::G_K; - using ResidualLayout = ctc::G_NDHW_K; - using OutLayout = ctc::G_NDHW_K; - - const auto in_g_n_c_wis_desc = HostTensorDescriptor( - {conv_param.G_, - conv_param.N_, - conv_param.C_, - conv_param.input_spatial_lengths_[0], - conv_param.input_spatial_lengths_[1], - conv_param.input_spatial_lengths_[2]}, - { - conv_param.C_, // g - conv_param.input_spatial_lengths_[0] * conv_param.input_spatial_lengths_[1] * - conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // n - 1, // c - conv_param.input_spatial_lengths_[1] * conv_param.input_spatial_lengths_[2] * - conv_param.G_ * conv_param.C_, // di - conv_param.input_spatial_lengths_[2] * conv_param.G_ * conv_param.C_, // hi - conv_param.G_ * conv_param.C_ // wi - }); - - const auto wei_g_k_c_xs_desc = HostTensorDescriptor( - {conv_param.G_, - conv_param.K_, - conv_param.C_, - conv_param.filter_spatial_lengths_[0], - conv_param.filter_spatial_lengths_[1], - conv_param.filter_spatial_lengths_[2]}, - { - conv_param.K_ * conv_param.filter_spatial_lengths_[0] * - conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] * - conv_param.C_, // g - conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1] * - conv_param.filter_spatial_lengths_[2] * conv_param.C_, // k - 1, // c - conv_param.filter_spatial_lengths_[1] * conv_param.filter_spatial_lengths_[2] * - conv_param.C_, // z - conv_param.filter_spatial_lengths_[2] * conv_param.C_, // y - conv_param.C_ // x - }); - - const auto bias_g_n_k_wos_desc = - HostTensorDescriptor({conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1], - conv_param.output_spatial_lengths_[2]}, - { - conv_param.K_, // g - 0, // n - 1, // k - 0, // z - 0, // y - 0 // x - }); - - const auto residual_g_n_k_wos_desc = - HostTensorDescriptor({conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1], - conv_param.output_spatial_lengths_[2]}, - { - conv_param.K_, // g - 0, // n - 1, // k - 0, // z - 0, // y - 0 // x - }); - - const auto out_g_n_k_wos_desc = HostTensorDescriptor( - {conv_param.G_, - conv_param.N_, - conv_param.K_, - conv_param.output_spatial_lengths_[0], - conv_param.output_spatial_lengths_[1], - conv_param.output_spatial_lengths_[2]}, - { - conv_param.K_, // g - conv_param.output_spatial_lengths_[0] * conv_param.output_spatial_lengths_[1] * - conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // n - 1, // k - conv_param.output_spatial_lengths_[1] * conv_param.output_spatial_lengths_[2] * - conv_param.G_ * conv_param.K_, // do - conv_param.output_spatial_lengths_[2] * conv_param.G_ * conv_param.K_, // ho - conv_param.G_ * conv_param.K_ // wo - }); - - return run_grouped_conv_fwd_bias_relu_add<3, - InKernelDataType, - WeiKernelDataType, - CShuffleDataType, - OutKernelDataType, - InElementOp, - WeiElementOp, - OutElementOp, - InUserDataType, - WeiUserDataType, - OutUserDataType, - DeviceGroupedConvNDFwdInstance<3, - InLayout, - WeiLayout, - BiasLayout, - ResidualLayout, - OutLayout>>( - do_verification, - init_method, - time_kernel, - conv_param, - in_g_n_c_wis_desc, - wei_g_k_c_xs_desc, - bias_g_n_k_wos_desc, - residual_g_n_k_wos_desc, - out_g_n_k_wos_desc, - in_element_op, - wei_element_op, - out_element_op); - } - - return 0; -} diff --git a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp index 481e2e6aee..644c7ee9a9 100644 --- a/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp +++ b/include/ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp @@ -14,39 +14,38 @@ namespace device { // Convolution Forward: // input : input image A[G, N, C, Hi, Wi], // input : weight B[G, K, C, Y, X], -// input : D0[G, N, K, Ho, Wo], D1[G, N, K, Ho, Wo], ... // output : output image E[G, N, K, Ho, Wo] // C = a_op(A) * b_op(B) // E = cde_op(C, D0, D1, ...) template + typename InLayout, + typename WeiLayout, + typename OutLayout, + typename InDataType, + typename WeiDataType, + typename OutDataType, + typename InElementwiseOperation, + typename WeiElementwiseOperation, + typename OutElementwiseOperation> struct DeviceGroupedConvFwd : public BaseOperator { virtual std::unique_ptr - MakeArgumentPointer(const void* p_a, // input image - const void* p_b, // weight - void* p_c, // output image - const std::array& a_g_n_c_wis_lengths, - const std::array& a_g_n_c_wis_strides, - const std::array& b_g_k_c_xs_lengths, - const std::array& b_g_k_c_xs_strides, - const std::array& c_g_n_k_wos_lengths, - const std::array& c_g_n_k_wos_strides, + MakeArgumentPointer(const void* p_in, // input image + const void* p_wei, // weight + void* p_out, // output image + const std::array& in_g_n_c_wis_lengths, + const std::array& in_g_n_c_wis_strides, + const std::array& wei_g_k_c_xs_lengths, + const std::array& wei_g_k_c_xs_strides, + const std::array& out_g_n_k_wos_lengths, + const std::array& out_g_n_k_wos_strides, const std::array& conv_filter_strides, const std::array& conv_filter_dilations, const std::array& input_left_pads, const std::array& input_right_pads, - const AElementwiseOperation& a_element_op, - const BElementwiseOperation& b_element_op, - const CElementwiseOperation& c_element_op) = 0; + const InElementwiseOperation& in_element_op, + const WeiElementwiseOperation& wei_element_op, + const OutElementwiseOperation& out_element_op) = 0; virtual std::unique_ptr MakeInvokerPointer() = 0; }; diff --git a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp index 9f71af75cc..f8d408dfff 100644 --- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp +++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp @@ -95,7 +95,7 @@ template using Add_Activation_Mul_Clamp = ck::tensor_operation::element_wise::Add_Activation_Mul_Clamp; -template +template struct DeviceOperationInstanceFactory; } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp index 6d645ec6fb..90f2a1d6bd 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp @@ -3,11 +3,9 @@ #pragma once -#include - #include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dl.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dl.hpp index c8ce393179..cd07cc3123 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dl.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dl.hpp @@ -3,11 +3,9 @@ #pragma once -#include - #include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/device_conv_fwd.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" diff --git a/profiler/include/profile_grouped_conv_fwd_impl.hpp b/profiler/include/profile_grouped_conv_fwd_impl.hpp index 32bded1bd4..e0ed15f687 100644 --- a/profiler/include/profile_grouped_conv_fwd_impl.hpp +++ b/profiler/include/profile_grouped_conv_fwd_impl.hpp @@ -9,12 +9,9 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_d.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward.hpp" - -#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_dl.hpp" #include "ck/library/utility/check_err.hpp" @@ -224,26 +221,25 @@ bool profile_grouped_conv_fwd_impl(int do_verification, for(auto& op_ptr : op_ptrs) { - auto argument_ptr = op_ptr->MakeArgumentPointer( - in_device_buf.GetDeviceBuffer(), - wei_device_buf.GetDeviceBuffer(), - std::array{}, - out_device_buf.GetDeviceBuffer(), - a_g_n_c_wis_lengths, - a_g_n_c_wis_strides, - b_g_k_c_xs_lengths, - b_g_k_c_xs_strides, - std::array, 0>{{}}, - std::array, 0>{{}}, - e_g_n_k_wos_lengths, - e_g_n_k_wos_strides, - conv_filter_strides, - conv_filter_dilations, - input_left_pads, - input_right_pads, - in_element_op, - wei_element_op, - out_element_op); + auto argument_ptr = op_ptr->MakeArgumentPointer(in_device_buf.GetDeviceBuffer(), + wei_device_buf.GetDeviceBuffer(), + {}, + out_device_buf.GetDeviceBuffer(), + a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + {}, + {}, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + out_element_op); run_impl(op_ptr, argument_ptr); } @@ -262,8 +258,10 @@ bool profile_grouped_conv_fwd_impl(int do_verification, WeiElementOp, OutElementOp>; + // get device op instances const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< DeviceOp>::GetInstances(); + std::cout << "dl found " << op_ptrs.size() << " instances" << std::endl; for(auto& op_ptr : op_ptrs)