diff --git a/CMakeLists.txt b/CMakeLists.txt index acae1f5ece..867109f429 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,6 +41,7 @@ include(CTest) option(ENABLE_CLANG_CPP_CHECKS "Enables clang tidy, cppcheck" ON) option(MIOPEN_REQ_LIBS_ONLY "Build only the MIOpen required libraries" OFF) option(CK_EXPERIMENTAL_BUILDER "Enable experimental builder" OFF) +option(CK_EXPERIMENTAL_PROFILER "Enable experimental profiler" ON) option(BUILD_MHA_LIB "Build the static library for flash attention" OFF) option(FORCE_DISABLE_XDL "Skip compiling XDL specific instances (even if supported GPUs are included in GPU_TARGETS)" OFF) option(FORCE_DISABLE_WMMA "Skip compiling WMMA specific instances (even if supported GPUs are included in GPU_TARGETS)" OFF) @@ -50,6 +51,12 @@ if(CK_EXPERIMENTAL_BUILDER) include_directories(${PROJECT_SOURCE_DIR}/experimental/builder/include) endif() +if(CK_EXPERIMENTAL_PROFILER) + add_definitions(-DCK_EXPERIMENTAL_PROFILER) + include_directories(${PROJECT_SOURCE_DIR}/experimental/ck_tile_profiler/include) +endif() + + # Usage: for customized Python location cmake -DCK_USE_ALTERNATIVE_PYTHON="/opt/Python-3.8.13/bin/python3.8" # CK Codegen requires dataclass which is added in Python 3.7 # Python version 3.8 is required for general good practice as it is default for Ubuntu 20.04 @@ -729,6 +736,10 @@ if (CK_EXPERIMENTAL_BUILDER) add_subdirectory(experimental/builder) endif() +if (CK_EXPERIMENTAL_PROFILER) + add_subdirectory(experimental/ck_tile_profiler) +endif() + if(CK_USE_CODEGEN AND (SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS)) add_subdirectory(codegen) endif() @@ -768,6 +779,13 @@ if(CK_EXPERIMENTAL_BUILDER) ) endif() +if(CK_EXPERIMENTAL_PROFILER) + rocm_install(DIRECTORY + ${PROJECT_SOURCE_DIR}/ck_tile_profiler + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck_tile + ) +endif() + set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE") set(CPACK_RPM_PACKAGE_LICENSE "MIT") diff --git a/profiler/ck_tile/CMakeLists.txt
b/experimental/ck_tile_profiler/CMakeLists.txt similarity index 75% rename from profiler/ck_tile/CMakeLists.txt rename to experimental/ck_tile_profiler/CMakeLists.txt index ee775efc03..bdd7125ac1 100644 --- a/profiler/ck_tile/CMakeLists.txt +++ b/experimental/ck_tile_profiler/CMakeLists.txt @@ -2,4 +2,4 @@ include_directories(BEFORE ${CMAKE_CURRENT_LIST_DIR}/include ) -add_subdirectory(src) \ No newline at end of file +add_subdirectory(src) diff --git a/profiler/ck_tile/include/tile_profile_grouped_conv_bwd_data_impl.hpp b/experimental/ck_tile_profiler/include/tile_profile_grouped_conv_bwd_data_impl.hpp similarity index 69% rename from profiler/ck_tile/include/tile_profile_grouped_conv_bwd_data_impl.hpp rename to experimental/ck_tile_profiler/include/tile_profile_grouped_conv_bwd_data_impl.hpp index e1ecbad9e2..99b25b4289 100644 --- a/profiler/ck_tile/include/tile_profile_grouped_conv_bwd_data_impl.hpp +++ b/experimental/ck_tile_profiler/include/tile_profile_grouped_conv_bwd_data_impl.hpp @@ -52,12 +52,12 @@ template bool profile_grouped_conv_bwd_data_impl(int do_verification, - int init_method, - bool /*do_log*/, - bool time_kernel, - const ck_tile::conv::ConvParam& conv_param, - const std::string& split_k, - ck_tile::index_t instance_index = -1) + int init_method, + bool /*do_log*/, + bool time_kernel, + const ck_tile::conv::ConvParam& conv_param, + const std::string& split_k, + ck_tile::index_t instance_index = -1) { using AccDataType = float; using InElementOp = ck_tile::element_wise::PassThrough; @@ -93,24 +93,21 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, ck_tile::FillUniformDistribution{-1.f, 1.f}(weight); ck_tile::FillUniformDistribution{-1.f, 1.f}(output); break; - default: - weight.SetZero(); - output.SetZero(); + default: weight.SetZero(); output.SetZero(); } - using DeviceOp = ops::GroupedConvolutionBackwardDataBaseInvoker< - NDimSpatial, - InLayout, - WeiLayout, - OutLayout, - InDataType, - WeiDataType, - OutDataType, - InElementOp, 
- WeiElementOp, - OutElementOp, - ComputeTypeA, - ComputeTypeB>; + using DeviceOp = ops::GroupedConvolutionBackwardDataBaseInvoker; // get device op instances const auto ops = ck_tile::ops::DeviceOperationInstanceFactory::GetInstances(); @@ -123,14 +120,15 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, float best_gb_per_sec = 0; std::string best_split_k("1"); - // std::vector split_k_list = {1, 2, 4, 6, 8, 10, 12, 16, 19, 32, 38, 64, 76, 128, 152, 256, 304}; + // std::vector split_k_list = {1, 2, 4, 6, 8, 10, 12, 16, 19, 32, 38, 64, 76, + // 128, 152, 256, 304}; std::vector split_k_list = {1, 2, 3, 4, 6, 8, 12, 16}; if(split_k != "all") { try { ck_tile::index_t split_k_value = std::stoi(split_k); - split_k_list = {split_k_value}; + split_k_list = {split_k_value}; } catch(const std::exception& e) { @@ -142,21 +140,21 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, // First, calculate the reference result if verification is needed. ck_tile::HostTensor input_host_ref(in_g_n_c_wis_desc); input_host_ref.SetZero(); - if (do_verification) + if(do_verification) { ck_tile::reference_grouped_conv_bwd_data( - input_host_ref, - weight, - output, - conv_param.conv_filter_strides_, - conv_param.conv_filter_dilations_, - conv_param.input_left_pads_, - conv_param.input_right_pads_); + input_host_ref, + weight, + output, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_); } - //instance_index = 0; + // instance_index = 0; index_t num_kernel = 0; - bool all_pass = true; + bool all_pass = true; for(auto& op : ops) { for(std::size_t split_k_id = 0; split_k_id < split_k_list.size(); split_k_id++) @@ -173,13 +171,13 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, output_dev_buf.ToDevice(output.data()); ck_tile::GroupedConvBwdDataHostArgs args(conv_param, - input_dev_buf.GetDeviceBuffer(), - weight_dev_buf.GetDeviceBuffer(), - {}, - 
output_dev_buf.GetDeviceBuffer(), - split_k_value); + input_dev_buf.GetDeviceBuffer(), + weight_dev_buf.GetDeviceBuffer(), + {}, + output_dev_buf.GetDeviceBuffer(), + split_k_value); - // Split-K autodeduction is not supported. + // Split-K autodeduction is not supported. if(op->IsSupportedArgument(args) && split_k_value >= 1) { num_kernel++; @@ -190,10 +188,12 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, } std::string op_name = op->GetName(args); - std::cout << op_name << ", SplitK " << split_k_param_str << " is profiled..." << std::endl; + std::cout << op_name << ", SplitK " << split_k_param_str << " is profiled..." + << std::endl; - // Run verification first. If it doesn't pass, no need to do performance measurement. - bool pass = false; + // Run verification first. If it doesn't pass, no need to do performance + // measurement. + bool pass = false; if(do_verification) { constexpr int n_warmup = 0; @@ -202,8 +202,9 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, op->Run(args, false, n_warmup, n_repeat); input_dev_buf.FromDevice(input.data()); - const ck_tile::index_t GemmK = conv_param.K_ - * conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1]; + const ck_tile::index_t GemmK = conv_param.K_ * + conv_param.filter_spatial_lengths_[0] * + conv_param.filter_spatial_lengths_[1]; const float max_accumulated_value = *std::max_element(input_host_ref.mData.begin(), input_host_ref.mData.end()); @@ -212,37 +213,39 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, GemmK, split_k_value, max_accumulated_value); pass = ck_tile::check_err(input, - input_host_ref, - "Error: Incorrect results!", - rtol_atol.at(ck_tile::number<0>{}), - rtol_atol.at(ck_tile::number<1>{})); + input_host_ref, + "Error: Incorrect results!", + rtol_atol.at(ck_tile::number<0>{}), + rtol_atol.at(ck_tile::number<1>{})); std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{}) - << " Absolute error threshold: 
" << rtol_atol.at(ck_tile::number<1>{}) - << std::endl; - std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl; + << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) + << std::endl; + std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") + << std::endl; all_pass &= pass; } bool is_valid = do_verification ? pass : true; - if (is_valid) + if(is_valid) { constexpr int n_warmup = 5; constexpr int n_repeat = 50; - float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat); + float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat); - std::size_t flop = conv_param.GetFlops(); - std::size_t num_btype = conv_param.GetByte(); + std::size_t flop = conv_param.GetFlops(); + std::size_t num_btype = + conv_param.GetByte(); float tflops = static_cast(flop) / 1.E9 / avg_time; float gb_per_sec = num_btype / 1.E6 / avg_time; std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops - << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK " - << split_k_param_str << std::endl; - + << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK " + << split_k_param_str << std::endl; + if(tflops > best_tflops) { best_op_name = op_name; @@ -253,7 +256,7 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, } } } - else + else { std::cout << op->GetName(args) << ", SplitK " << split_k_param_str << " does not support this problem." 
<< std::endl; @@ -262,14 +265,11 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification, } std::stringstream ss; - ss << "\n********************************" - << "\nCK Tile best configuration parameters:" - << "\n********************************" - << "\nname: " << best_op_name - << "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops - << "\nGB/s: " << best_gb_per_sec - << "\nSplitK: " << best_split_k - << std::endl; + ss << "\n********************************" + << "\nCK Tile best configuration parameters:" << "\n********************************" + << "\nname: " << best_op_name << "\navg_time: " << best_avg_time + << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec + << "\nSplitK: " << best_split_k << std::endl; std::cout << ss.str(); diff --git a/profiler/ck_tile/include/tile_profile_grouped_conv_bwd_weight_impl.hpp b/experimental/ck_tile_profiler/include/tile_profile_grouped_conv_bwd_weight_impl.hpp similarity index 70% rename from profiler/ck_tile/include/tile_profile_grouped_conv_bwd_weight_impl.hpp rename to experimental/ck_tile_profiler/include/tile_profile_grouped_conv_bwd_weight_impl.hpp index d9fd8b9e30..69c6775347 100644 --- a/profiler/ck_tile/include/tile_profile_grouped_conv_bwd_weight_impl.hpp +++ b/experimental/ck_tile_profiler/include/tile_profile_grouped_conv_bwd_weight_impl.hpp @@ -93,24 +93,21 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, ck_tile::FillUniformDistribution{0.f, 1.f}(input); ck_tile::FillUniformDistribution{0.f, 1.f}(output); break; - default: - input.SetZero(); - output.SetZero(); + default: input.SetZero(); output.SetZero(); } - using DeviceOp = ops::GroupedConvolutionBackwardWeightBaseInvoker< - NDimSpatial, - InLayout, - WeiLayout, - OutLayout, - InDataType, - WeiDataType, - OutDataType, - InElementOp, - WeiElementOp, - OutElementOp, - ComputeTypeA, - ComputeTypeB>; + using DeviceOp = ops::GroupedConvolutionBackwardWeightBaseInvoker; // get device op instances const auto ops = 
ck_tile::ops::DeviceOperationInstanceFactory::GetInstances(); @@ -123,13 +120,14 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, float best_gb_per_sec = 0; std::string best_split_k("1"); - std::vector split_k_list = {1, 2, 4, 6, 8, 10, 12, 16, 19, 32, 38, 64, 76, 128, 152, 256, 304}; + std::vector split_k_list = { + 1, 2, 4, 6, 8, 10, 12, 16, 19, 32, 38, 64, 76, 128, 152, 256, 304}; if(split_k != "all") { try { ck_tile::index_t split_k_value = std::stoi(split_k); - split_k_list = {split_k_value}; + split_k_list = {split_k_value}; } catch(const std::exception& e) { @@ -141,21 +139,21 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, // First, calculate the reference result if verification is needed. ck_tile::HostTensor weight_host_ref(wei_g_k_c_xs_desc); weight_host_ref.SetZero(); - if (do_verification) + if(do_verification) { - ck_tile::reference_grouped_conv_bwd_weight( - input, - weight_host_ref, - output, - conv_param.conv_filter_strides_, - conv_param.conv_filter_dilations_, - conv_param.input_left_pads_, - conv_param.input_right_pads_); + ck_tile:: + reference_grouped_conv_bwd_weight( + input, + weight_host_ref, + output, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_); } - index_t num_kernel = 0; - bool all_pass = true; + bool all_pass = true; for(auto& op : ops) { for(std::size_t split_k_id = 0; split_k_id < split_k_list.size(); split_k_id++) @@ -172,13 +170,13 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, output_dev_buf.ToDevice(output.data()); ck_tile::GroupedConvBwdWeightHostArgs args(conv_param, - input_dev_buf.GetDeviceBuffer(), - weight_dev_buf.GetDeviceBuffer(), - {}, - output_dev_buf.GetDeviceBuffer(), - split_k_value); + input_dev_buf.GetDeviceBuffer(), + weight_dev_buf.GetDeviceBuffer(), + {}, + output_dev_buf.GetDeviceBuffer(), + split_k_value); - // Split-K autodeduction is not supported. 
+ // Split-K autodeduction is not supported. if(op->IsSupportedArgument(args) && split_k_value >= 1) { num_kernel++; @@ -189,10 +187,12 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, } std::string op_name = op->GetName(args); - std::cout << op_name << ", SplitK " << split_k_param_str << " is profiled..." << std::endl; + std::cout << op_name << ", SplitK " << split_k_param_str << " is profiled..." + << std::endl; - // Run verification first. If it doesn't pass, no need to do performance measurement. - bool pass = false; + // Run verification first. If it doesn't pass, no need to do performance + // measurement. + bool pass = false; if(do_verification) { constexpr int n_warmup = 0; @@ -201,45 +201,48 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, op->Run(args, false, n_warmup, n_repeat); weight_dev_buf.FromDevice(weight.data()); - const ck_tile::index_t GemmK = weight.get_element_size() / (conv_param.G_ * conv_param.K_); - const float max_accumulated_value = - *std::max_element(weight_host_ref.mData.begin(), weight_host_ref.mData.end()); + const ck_tile::index_t GemmK = + weight.get_element_size() / (conv_param.G_ * conv_param.K_); + const float max_accumulated_value = *std::max_element( + weight_host_ref.mData.begin(), weight_host_ref.mData.end()); const auto rtol_atol = calculate_rtol_atol( GemmK, split_k_value, max_accumulated_value); pass = ck_tile::check_err(weight, - weight_host_ref, - "Error: Incorrect results!", - rtol_atol.at(ck_tile::number<0>{}), - rtol_atol.at(ck_tile::number<1>{})); + weight_host_ref, + "Error: Incorrect results!", + rtol_atol.at(ck_tile::number<0>{}), + rtol_atol.at(ck_tile::number<1>{})); std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{}) - << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) - << std::endl; - std::cout << "The CPU verification result is:" << (pass ? 
"correct" : "fail") << std::endl; + << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) + << std::endl; + std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") + << std::endl; all_pass &= pass; } bool is_valid = do_verification ? pass : true; - if (is_valid) + if(is_valid) { constexpr int n_warmup = 5; constexpr int n_repeat = 50; - float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat); + float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat); - std::size_t flop = conv_param.GetFlops(); - std::size_t num_btype = conv_param.GetByte(); + std::size_t flop = conv_param.GetFlops(); + std::size_t num_btype = + conv_param.GetByte(); float tflops = static_cast(flop) / 1.E9 / avg_time; float gb_per_sec = num_btype / 1.E6 / avg_time; std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops - << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK " - << split_k_param_str << std::endl; - + << " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK " + << split_k_param_str << std::endl; + if(tflops > best_tflops) { best_op_name = op_name; @@ -250,7 +253,7 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, } } } - else + else { std::cout << op->GetName(args) << ", SplitK " << split_k_param_str << " does not support this problem." 
<< std::endl; @@ -259,14 +262,11 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification, } std::stringstream ss; - ss << "\n********************************" - << "\nCK Tile best configuration parameters:" - << "\n********************************" - << "\nname: " << best_op_name - << "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops - << "\nGB/s: " << best_gb_per_sec - << "\nSplitK: " << best_split_k - << std::endl; + ss << "\n********************************" + << "\nCK Tile best configuration parameters:" << "\n********************************" + << "\nname: " << best_op_name << "\navg_time: " << best_avg_time + << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec + << "\nSplitK: " << best_split_k << std::endl; std::cout << ss.str(); diff --git a/profiler/ck_tile/include/tile_profile_grouped_conv_fwd_impl.hpp b/experimental/ck_tile_profiler/include/tile_profile_grouped_conv_fwd_impl.hpp similarity index 70% rename from profiler/ck_tile/include/tile_profile_grouped_conv_fwd_impl.hpp rename to experimental/ck_tile_profiler/include/tile_profile_grouped_conv_fwd_impl.hpp index 3bc0609950..e541cdd2d8 100644 --- a/profiler/ck_tile/include/tile_profile_grouped_conv_fwd_impl.hpp +++ b/experimental/ck_tile_profiler/include/tile_profile_grouped_conv_fwd_impl.hpp @@ -52,12 +52,12 @@ template bool profile_grouped_conv_fwd_impl(int do_verification, - int init_method, - bool /*do_log*/, - bool time_kernel, - const ck_tile::conv::ConvParam& conv_param, - const ck_tile::index_t k_batch, - ck_tile::index_t instance_index = -1) + int init_method, + bool /*do_log*/, + bool time_kernel, + const ck_tile::conv::ConvParam& conv_param, + const ck_tile::index_t k_batch, + ck_tile::index_t instance_index = -1) { using AccDataType = float; using InElementOp = ck_tile::element_wise::PassThrough; @@ -100,19 +100,18 @@ bool profile_grouped_conv_fwd_impl(int do_verification, weight.SetZero(); } - using DeviceOp = ops::GroupedConvolutionForwardBaseInvoker< - 
NDimSpatial, - InLayout, - WeiLayout, - OutLayout, - InDataType, - WeiDataType, - OutDataType, - InElementOp, - WeiElementOp, - OutElementOp, - ComputeTypeA, - ComputeTypeB>; + using DeviceOp = ops::GroupedConvolutionForwardBaseInvoker; // get device op instances const auto ops = ck_tile::ops::DeviceOperationInstanceFactory::GetInstances(); @@ -125,7 +124,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification, float best_gb_per_sec = 0; index_t num_kernel = 0; - bool all_pass = true; + bool all_pass = true; // tmp enforce instance // instance_index = -1; @@ -141,12 +140,12 @@ bool profile_grouped_conv_fwd_impl(int do_verification, output_dev_buf.SetZero(); ck_tile::GroupedConvFwdHostArgs args(conv_param, - input_dev_buf.GetDeviceBuffer(), - weight_dev_buf.GetDeviceBuffer(), - {}, - output_dev_buf.GetDeviceBuffer(), - k_batch); - + input_dev_buf.GetDeviceBuffer(), + weight_dev_buf.GetDeviceBuffer(), + {}, + output_dev_buf.GetDeviceBuffer(), + k_batch); + if(op->IsSupportedArgument(args)) { num_kernel++; @@ -160,7 +159,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification, std::cout << op_name << " is profiled..." << std::endl; // Run verification first. If it doesn't pass, no need to do performance measurement. 
- bool pass = false; + bool pass = false; if(do_verification) { constexpr int n_warmup = 0; @@ -172,39 +171,42 @@ bool profile_grouped_conv_fwd_impl(int do_verification, ck_tile::HostTensor output_host_ref(out_g_n_k_wos_desc); output_host_ref.SetZero(); - ck_tile::reference_grouped_conv_fwd( - input, - weight, - output_host_ref, - conv_param.conv_filter_strides_, - conv_param.conv_filter_dilations_, - conv_param.input_left_pads_, - conv_param.input_right_pads_); - const ck_tile::index_t GemmK = weight.get_element_size() / (conv_param.G_ * conv_param.K_); + ck_tile:: + reference_grouped_conv_fwd( + input, + weight, + output_host_ref, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_); + const ck_tile::index_t GemmK = + weight.get_element_size() / (conv_param.G_ * conv_param.K_); const float max_accumulated_value = *std::max_element(output_host_ref.mData.begin(), output_host_ref.mData.end()); const auto rtol_atol = calculate_rtol_atol( GemmK, k_batch, max_accumulated_value); pass = ck_tile::check_err(output, - output_host_ref, - "Error: Incorrect results!", - rtol_atol.at(ck_tile::number<0>{}), - rtol_atol.at(ck_tile::number<1>{})); + output_host_ref, + "Error: Incorrect results!", + rtol_atol.at(ck_tile::number<0>{}), + rtol_atol.at(ck_tile::number<1>{})); std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{}) - << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) - << std::endl; - std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl; + << " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{}) + << std::endl; + std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") + << std::endl; all_pass &= pass; } bool is_valid = do_verification ? 
pass : true; - if (is_valid) + if(is_valid) { constexpr int n_warmup = 5; constexpr int n_repeat = 50; - float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat); + float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat); std::size_t flop = conv_param.GetFlops(); std::size_t num_btype = conv_param.GetByte(); @@ -213,7 +215,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification, float gb_per_sec = num_btype / 1.E6 / avg_time; std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops - << " TFlops, " << gb_per_sec << " GB/s, " << op_name << std::endl; + << " TFlops, " << gb_per_sec << " GB/s, " << op_name << std::endl; if(tflops > best_tflops) { @@ -224,18 +226,16 @@ bool profile_grouped_conv_fwd_impl(int do_verification, } } } - else + else { - //std::cout << op->GetName(args) << " does not support this problem." << std::endl; + // std::cout << op->GetName(args) << " does not support this problem." << std::endl; } } - std::cout << "\n********************************" - << "\nBest configuration parameters:" - << "\n********************************" - << "\nname: " << best_op_name - << "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops - << "\nGB/s: " << best_gb_per_sec << std::endl; + std::cout << "\n********************************" + << "\nBest configuration parameters:" << "\n********************************" + << "\nname: " << best_op_name << "\navg_time: " << best_avg_time + << "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl; const char* log_file = std::getenv("CK_TILE_PROFILER_LOG_FILE"); if(log_file != nullptr) diff --git a/profiler/ck_tile/include/tile_profiler_operation_registry.hpp b/experimental/ck_tile_profiler/include/tile_profiler_operation_registry.hpp similarity index 100% rename from profiler/ck_tile/include/tile_profiler_operation_registry.hpp rename to experimental/ck_tile_profiler/include/tile_profiler_operation_registry.hpp diff --git a/profiler/ck_tile/src/CMakeLists.txt 
b/experimental/ck_tile_profiler/src/CMakeLists.txt similarity index 100% rename from profiler/ck_tile/src/CMakeLists.txt rename to experimental/ck_tile_profiler/src/CMakeLists.txt diff --git a/profiler/ck_tile/src/tile_profile_grouped_conv_bwd_data.cpp b/experimental/ck_tile_profiler/src/tile_profile_grouped_conv_bwd_data.cpp similarity index 86% rename from profiler/ck_tile/src/tile_profile_grouped_conv_bwd_data.cpp rename to experimental/ck_tile_profiler/src/tile_profile_grouped_conv_bwd_data.cpp index bfd5393407..e142a776c7 100644 --- a/profiler/ck_tile/src/tile_profile_grouped_conv_bwd_data.cpp +++ b/experimental/ck_tile_profiler/src/tile_profile_grouped_conv_bwd_data.cpp @@ -25,9 +25,9 @@ enum struct ConvLayout enum struct ConvDataType { - F32_F32_F32, // 0 - F16_F16_F16, // 1 - BF16_BF16_BF16, // 2 + F32_F32_F32, // 0 + F16_F16_F16, // 1 + BF16_BF16_BF16, // 2 }; #define OP_NAME "grouped_conv_bwd_data" @@ -38,14 +38,14 @@ static void print_helper_msg() std::string conv_param_parser_helper_msg; conv_param_parser_helper_msg += "Following arguments (depending on number of spatial dims):\n" - " Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n" - " G, N, K, C, \n" - " , (ie Y, X for 2D)\n" - " , (ie Hi, Wi for 2D)\n" - " , (ie Sy, Sx for 2D)\n" - " , (ie Dy, Dx for 2D)\n" - " , (ie LeftPy, LeftPx for 2D)\n" - " , (ie RightPy, RightPx for 2D)\n"; + " Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n" + " G, N, K, C, \n" + " , (ie Y, X for 2D)\n" + " , (ie Hi, Wi for 2D)\n" + " , (ie Sy, Sx for 2D)\n" + " , (ie Dy, Dx for 2D)\n" + " , (ie LeftPy, LeftPx for 2D)\n" + " , (ie RightPy, RightPx for 2D)\n"; std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n" << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n" @@ -111,17 +111,17 @@ int tile_profile_grouped_conv_bwd_data(int argc, char* argv[]) // using F8 = ck_tile::fp8_t; // using BF8 = ck_tile::bf8_t; - using NHWGC = ck_tile::tensor_layout::convolution::NHWGC; + using 
NHWGC = ck_tile::tensor_layout::convolution::NHWGC; // using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC; - using GKYXC = ck_tile::tensor_layout::convolution::GKYXC; + using GKYXC = ck_tile::tensor_layout::convolution::GKYXC; // using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC; - using NHWGK = ck_tile::tensor_layout::convolution::NHWGK; + using NHWGK = ck_tile::tensor_layout::convolution::NHWGK; // using NDHWGK = ck_tile::tensor_layout::convolution::NDHWGK; constexpr auto I2 = ck_tile::number<2>{}; - //constexpr auto I3 = ck_tile::number<3>{}; + // constexpr auto I3 = ck_tile::number<3>{}; auto profile = [&](auto num_dim_spatial_tmp, auto in_layout, @@ -146,14 +146,14 @@ int tile_profile_grouped_conv_bwd_data(int argc, char* argv[]) using ComputeTypeB = decltype(compute_type_b); bool pass = ck_tile::profiler::profile_grouped_conv_bwd_data_impl( + InLayout, + WeiLayout, + OutLayout, + InDataType, + WeiDataType, + OutDataType, + ComputeTypeA, + ComputeTypeB>( do_verification, init_method, do_log, time_kernel, params, split_k); return pass ? 
0 : 1; @@ -179,7 +179,7 @@ int tile_profile_grouped_conv_bwd_data(int argc, char* argv[]) return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); } } - + // if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK) // { // if(data_type == ConvDataType::F32_F32_F32) @@ -193,7 +193,8 @@ int tile_profile_grouped_conv_bwd_data(int argc, char* argv[]) // if(data_type == ConvDataType::BF16_F32_BF16) // { // // fp32 atomic add is used for weight tensor in bf16 kernel - // return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{}); + // return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, + // BF16{}); // } // if(data_type == ConvDataType::BF16_BF16_BF16) // { @@ -207,7 +208,8 @@ int tile_profile_grouped_conv_bwd_data(int argc, char* argv[]) // else if(data_type == ConvDataType::I8_I8_I8) // { // return profile( - // I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, int8_t{}, int8_t{}, int8_t{}, int8_t{}, int8_t{}); + // I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, int8_t{}, int8_t{}, int8_t{}, int8_t{}, + // int8_t{}); // } // } diff --git a/profiler/ck_tile/src/tile_profile_grouped_conv_bwd_weight.cpp b/experimental/ck_tile_profiler/src/tile_profile_grouped_conv_bwd_weight.cpp similarity index 90% rename from profiler/ck_tile/src/tile_profile_grouped_conv_bwd_weight.cpp rename to experimental/ck_tile_profiler/src/tile_profile_grouped_conv_bwd_weight.cpp index 6e56ece96b..15ddcbb655 100644 --- a/profiler/ck_tile/src/tile_profile_grouped_conv_bwd_weight.cpp +++ b/experimental/ck_tile_profiler/src/tile_profile_grouped_conv_bwd_weight.cpp @@ -43,14 +43,14 @@ static void print_helper_msg() std::string conv_param_parser_helper_msg; conv_param_parser_helper_msg += "Following arguments (depending on number of spatial dims):\n" - " Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n" - " G, N, K, C, \n" - " , (ie Y, X for 2D)\n" - " , (ie Hi, Wi for 2D)\n" - " , (ie Sy, Sx for 2D)\n" - " , (ie Dy, Dx 
for 2D)\n" - " , (ie LeftPy, LeftPx for 2D)\n" - " , (ie RightPy, RightPx for 2D)\n"; + " Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n" + " G, N, K, C, \n" + " , (ie Y, X for 2D)\n" + " , (ie Hi, Wi for 2D)\n" + " , (ie Sy, Sx for 2D)\n" + " , (ie Dy, Dx for 2D)\n" + " , (ie LeftPy, LeftPx for 2D)\n" + " , (ie RightPy, RightPx for 2D)\n"; std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n" << "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n" @@ -151,14 +151,14 @@ int tile_profile_grouped_conv_bwd_weight(int argc, char* argv[]) using ComputeTypeB = decltype(compute_type_b); bool pass = ck_tile::profiler::profile_grouped_conv_bwd_weight_impl( + InLayout, + WeiLayout, + OutLayout, + InDataType, + WeiDataType, + OutDataType, + ComputeTypeA, + ComputeTypeB>( do_verification, init_method, do_log, time_kernel, params, split_k); return pass ? 0 : 1; @@ -184,7 +184,7 @@ int tile_profile_grouped_conv_bwd_weight(int argc, char* argv[]) return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); } } - + if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK) { if(data_type == ConvDataType::F32_F32_F32) diff --git a/profiler/ck_tile/src/tile_profile_grouped_conv_fwd.cpp b/experimental/ck_tile_profiler/src/tile_profile_grouped_conv_fwd.cpp similarity index 87% rename from profiler/ck_tile/src/tile_profile_grouped_conv_fwd.cpp rename to experimental/ck_tile_profiler/src/tile_profile_grouped_conv_fwd.cpp index bb447039ea..283e6e7fc5 100644 --- a/profiler/ck_tile/src/tile_profile_grouped_conv_fwd.cpp +++ b/experimental/ck_tile_profiler/src/tile_profile_grouped_conv_fwd.cpp @@ -25,10 +25,10 @@ enum struct ConvLayout enum struct ConvDataType { - F32_F32_F32, // 0 - F16_F16_F16, // 1 - BF16_BF16_BF16, // 2 - I8_I8_I8, // 3 + F32_F32_F32, // 0 + F16_F16_F16, // 1 + BF16_BF16_BF16, // 2 + I8_I8_I8, // 3 }; #define OP_NAME "grouped_conv_fwd" @@ -39,14 +39,14 @@ static void print_helper_msg() std::string 
conv_param_parser_helper_msg; conv_param_parser_helper_msg += "Following arguments (depending on number of spatial dims):\n" - " Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n" - " G, N, K, C, \n" - " , (ie Y, X for 2D)\n" - " , (ie Hi, Wi for 2D)\n" - " , (ie Sy, Sx for 2D)\n" - " , (ie Dy, Dx for 2D)\n" - " , (ie LeftPy, LeftPx for 2D)\n" - " , (ie RightPy, RightPx for 2D)\n"; + " Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n" + " G, N, K, C, \n" + " , (ie Y, X for 2D)\n" + " , (ie Hi, Wi for 2D)\n" + " , (ie Sy, Sx for 2D)\n" + " , (ie Dy, Dx for 2D)\n" + " , (ie LeftPy, LeftPx for 2D)\n" + " , (ie RightPy, RightPx for 2D)\n"; std::cout // clang-format off @@ -101,7 +101,7 @@ int tile_profile_grouped_conv_fwd(int argc, char* argv[]) return 1; } - const auto params = ck_tile::conv::parse_conv_param(num_dim_spatial, 9, argv); + const auto params = ck_tile::conv::parse_conv_param(num_dim_spatial, 9, argv); constexpr ck_tile::index_t k_batch = 1; using F32 = float; @@ -143,14 +143,14 @@ int tile_profile_grouped_conv_fwd(int argc, char* argv[]) using ComputeTypeB = decltype(compute_type_b); bool pass = ck_tile::profiler::profile_grouped_conv_fwd_impl( + InLayout, + WeiLayout, + OutLayout, + InDataType, + WeiDataType, + OutDataType, + ComputeTypeA, + ComputeTypeB>( do_verification, init_method, do_log, time_kernel, params, k_batch); return pass ? 
0 : 1; @@ -171,7 +171,7 @@ int tile_profile_grouped_conv_fwd(int argc, char* argv[]) return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{}); } } - + if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK) { if(data_type == ConvDataType::F32_F32_F32) diff --git a/profiler/ck_tile/src/tile_profiler.cpp b/experimental/ck_tile_profiler/src/tile_profiler.cpp similarity index 100% rename from profiler/ck_tile/src/tile_profiler.cpp rename to experimental/ck_tile_profiler/src/tile_profiler.cpp diff --git a/include/ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp b/include/ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp index 71739c9083..bbf333efab 100644 --- a/include/ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp +++ b/include/ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp @@ -5,6 +5,8 @@ #include "ck_tile/core.hpp" #include "ck_tile/host/convolution_parameter.hpp" +#include "ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp" +#include "ck_tile/ops/grouped_convolution/utils/convolution_specialization.hpp" #include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp" namespace ck_tile { diff --git a/library/CMakeLists.txt b/library/CMakeLists.txt index 083d2e4b1e..0ed9785edf 100644 --- a/library/CMakeLists.txt +++ b/library/CMakeLists.txt @@ -3,3 +3,4 @@ add_subdirectory(src/tensor_operation_instance/gpu) add_subdirectory(src/utility) +add_subdirectory(src/ck_tile/tensor_operation_instance/gpu) diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/gemm_configs.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/gemm_configs.hpp new file mode 100644 index 0000000000..f8db4c0464 --- /dev/null +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/gemm_configs.hpp @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include +#include + +#include "ck_tile/core.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include "ck_tile/ops/epilogue.hpp" +#include "ck_tile/ops/gemm.hpp" + +#define CK_TILE_PIPELINE_COMPUTE_V3 1 +#define CK_TILE_PIPELINE_MEMORY 2 +#define CK_TILE_PIPELINE_COMPUTE_V4 3 +#define CK_TILE_PIPELINE_COMPUTE_V5 4 + +namespace ck_tile { +namespace ops { + +using MemoryOpSet = + std::integral_constant; + +using MemoryOpAtomicAdd = std::integral_constant; + +struct GemmConfigBase +{ + static constexpr bool kPadM = true; + static constexpr bool kPadN = true; + static constexpr bool kPadK = true; + + static constexpr bool PermuteA = false; + static constexpr bool PermuteB = false; + + static constexpr bool TransposeC = false; + static constexpr bool UseStructuredSparsity = false; + + static constexpr int kBlockPerCu = 1; + static constexpr ck_tile::index_t TileParitionerGroupNum = 8; + static constexpr ck_tile::index_t TileParitionerM01 = 4; + static constexpr auto Scheduler = ck_tile::GemmPipelineScheduler::Intrawave; + static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V3; + static constexpr ck_tile::index_t NumWaveGroups = 1; + static constexpr bool Preshuffle = false; + static constexpr bool TiledMMAPermuteN = false; +}; + +template +struct PipelineTypeTraits; + +template <> +struct PipelineTypeTraits +{ + template + using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem; + template + using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem; +}; + +template <> +struct PipelineTypeTraits +{ + template + using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV3; + template + using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV3; +}; + +template <> +struct PipelineTypeTraits +{ + template + using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV4; + template + using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV4; +}; + +template <> +struct PipelineTypeTraits +{ + template + using GemmPipeline = 
ck_tile::GemmPipelineAgBgCrCompV5; + template + using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV5; +}; + +} // namespace ops +} // namespace ck_tile diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances.hpp index ac35ae8cc1..9465d40dbb 100644 --- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances.hpp +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances.hpp @@ -11,32 +11,29 @@ namespace ops { using BF16 = ck_tile::bfloat16_t; using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2, - NHWGC, - GKYXC, - NHWGK, - BF16, - BF16, - BF16, - PassThrough, - PassThrough, - PassThrough, - BF16, - BF16>; + NHWGC, + GKYXC, + NHWGK, + BF16, + BF16, + BF16, + PassThrough, + PassThrough, + PassThrough, + BF16, + BF16>; -template +template using tile_grouped_conv_bwd_data_bf16_instances = std::tuple< -// clang-format off + // clang-format off //###################################| Num| InLayout| WeiLayout| OutLayout| InData|WeiData|OutData| In| Wei| Out| Conv|K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| //###################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| //###################################|Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version| -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, GroupedConvolutionBackwardDataInvoker -// clang-format on ->; + // clang-format on + >; } // namespace ops 
} // namespace ck_tile diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances_2.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances_2.hpp index 548fffbacf..66074fd92a 100644 --- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances_2.hpp +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances_2.hpp @@ -11,58 +11,55 @@ namespace ops { using BF16 = ck_tile::bfloat16_t; using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2, - NHWGC, - GKYXC, - NHWGK, - BF16, - BF16, - BF16, - PassThrough, - PassThrough, - PassThrough, - BF16, - BF16>; + NHWGC, + GKYXC, + NHWGK, + BF16, + BF16, + BF16, + PassThrough, + PassThrough, + PassThrough, + BF16, + BF16>; -template +template using tile_grouped_conv_bwd_data_bf16_instances_2 = std::tuple< -// clang-format off + // clang-format off //###################################| Num| InLayout| WeiLayout| OutLayout| InData|WeiData|OutData| In| Wei| Out| Conv|K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| //###################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| //###################################|Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version| -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, // 
prob this -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker -// clang-format on ->; +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, // prob this +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// 
GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker + // clang-format on + >; } // namespace ops } // namespace ck_tile diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances_3.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances_3.hpp index f45a181441..f5cc80955f 100644 --- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances_3.hpp +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances_3.hpp @@ -11,61 +11,58 @@ namespace ops { using BF16 = ck_tile::bfloat16_t; using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2, - NHWGC, - GKYXC, - NHWGK, - BF16, - BF16, - BF16, - PassThrough, - PassThrough, - PassThrough, - BF16, - BF16>; + NHWGC, + GKYXC, + NHWGK, + BF16, + BF16, + BF16, + PassThrough, + PassThrough, + PassThrough, + BF16, + BF16>; -template +template using tile_grouped_conv_bwd_data_bf16_instances_3 = std::tuple< -// clang-format off + // clang-format off //###################################| Num| InLayout| WeiLayout| OutLayout| InData|WeiData|OutData| In| Wei| Out| Conv|K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| //###################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| //###################################|Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version| -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, // ta -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, 
-GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker -// clang-format on ->; +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, // ta +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// 
GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker + // clang-format on + >; } // namespace ops } // namespace ck_tile diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances_4.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances_4.hpp index 6ca0c42ff3..ae0d451fe7 100644 --- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances_4.hpp +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances_4.hpp @@ -11,58 +11,55 @@ namespace ops { using BF16 = ck_tile::bfloat16_t; using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2, - NHWGC, - GKYXC, - NHWGK, - BF16, - BF16, - BF16, - PassThrough, - PassThrough, - PassThrough, - BF16, - BF16>; + NHWGC, + GKYXC, + NHWGK, + BF16, + BF16, + BF16, + PassThrough, + PassThrough, + PassThrough, + BF16, + BF16>; -template +template using tile_grouped_conv_bwd_data_bf16_instances_4 = std::tuple< -// clang-format off + // clang-format off //###################################| Num| InLayout| WeiLayout| OutLayout| InData|WeiData|OutData| In| Wei| Out| Conv|K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| //###################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| //###################################|Spatial| 
| | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version| -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker -// clang-format on ->; +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// 
GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker + // clang-format on + >; } // namespace ops } // namespace ck_tile diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances_5.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances_5.hpp index 591fc43c51..a48988e2ce 100644 --- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances_5.hpp +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances_5.hpp @@ -11,62 +11,59 @@ namespace ops { using BF16 = ck_tile::bfloat16_t; using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2, - NHWGC, - GKYXC, - NHWGK, - BF16, - BF16, - BF16, - PassThrough, - PassThrough, - PassThrough, - BF16, - BF16>; + NHWGC, + GKYXC, + NHWGK, + BF16, + BF16, + BF16, + PassThrough, + PassThrough, + PassThrough, + BF16, + BF16>; -template +template using tile_grouped_conv_bwd_data_bf16_instances_5 = std::tuple< -// clang-format off + // clang-format off //###################################| Num| InLayout| WeiLayout| OutLayout| InData|WeiData|OutData| In| Wei| Out| Conv|K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| //###################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| 
tile| size| size| size| smem| pipeline| //###################################|Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version| -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, // ta -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker -// clang-format on ->; +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, // ta +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// 
GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker + // clang-format on + >; } // namespace ops } // namespace ck_tile diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances_6.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances_6.hpp index 58a9e0c203..daee80d8e3 100644 --- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances_6.hpp +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances_6.hpp @@ -11,60 +11,57 @@ namespace ops { using BF16 = ck_tile::bfloat16_t; using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2, - NHWGC, - GKYXC, - NHWGK, - BF16, - BF16, - BF16, - PassThrough, - PassThrough, - PassThrough, - BF16, - BF16>; + NHWGC, + GKYXC, + NHWGK, + BF16, + BF16, + BF16, + PassThrough, + PassThrough, + PassThrough, + BF16, + BF16>; -template +template using tile_grouped_conv_bwd_data_bf16_instances_6 = 
std::tuple< -// clang-format off + // clang-format off //###################################| Num| InLayout| WeiLayout| OutLayout| InData|WeiData|OutData| In| Wei| Out| Conv|K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| //###################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| //###################################|Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version| -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker -// clang-format on ->; +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// 
GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker + // clang-format on + >; } // namespace ops } // namespace ck_tile diff --git a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances_7.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances_7.hpp index 7df82292d5..12918fa110 100644 --- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances_7.hpp +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_bf16_instances_7.hpp @@ -11,64 +11,61 @@ namespace ops { using BF16 = ck_tile::bfloat16_t; using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2, - NHWGC, - GKYXC, - NHWGK, - BF16, - BF16, - 
BF16, - PassThrough, - PassThrough, - PassThrough, - BF16, - BF16>; + NHWGC, + GKYXC, + NHWGK, + BF16, + BF16, + BF16, + PassThrough, + PassThrough, + PassThrough, + BF16, + BF16>; -template +template using tile_grouped_conv_bwd_data_bf16_instances_7 = std::tuple< -// clang-format off + // clang-format off //###################################| Num| InLayout| WeiLayout| OutLayout| InData|WeiData|OutData| In| Wei| Out| Conv|K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM| //###################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline| //###################################|Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version| -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, // prob this -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, 
-GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker, -GroupedConvolutionBackwardDataInvoker -// clang-format on ->; +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, // prob this +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker, +// GroupedConvolutionBackwardDataInvoker + // clang-format on + >; } // namespace ops } // namespace ck_tile diff --git 
a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_invoker.hpp b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_invoker.hpp index 8aac4ebc1e..125b68f197 100644 --- a/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_invoker.hpp +++ b/library/include/ck_tile/library/tensor_operation_instance/gpu/tile_grouped_conv_bwd_data_invoker.hpp @@ -11,7 +11,7 @@ #include "ck_tile/host/kernel_launch.hpp" #include "ck_tile/ops/epilogue.hpp" #include "ck_tile/ops/gemm.hpp" -#include "ck_tile/ops/grouped_convolution.hpp" +#include "ck_tile/ops/grouped_convolution.hpp" #include "ck_tile/library/tensor_operation_instance/gpu/gemm_configs.hpp" namespace ck_tile { @@ -31,192 +31,209 @@ template struct GroupedConvolutionBackwardDataBaseInvoker { - virtual bool IsSupportedArgument(const ck_tile::GroupedConvBwdDataHostArgs& args) const = 0; - virtual float Run(const ck_tile::GroupedConvBwdDataHostArgs& args, bool time_kernel, int n_warmup, int n_repeat) const = 0; - virtual std::string GetName(const ck_tile::GroupedConvBwdDataHostArgs& args) const = 0; + virtual bool IsSupportedArgument(const ck_tile::GroupedConvBwdDataHostArgs& args) const = 0; + virtual float Run(const ck_tile::GroupedConvBwdDataHostArgs& args, + bool time_kernel, + int n_warmup, + int n_repeat) const = 0; + virtual std::string GetName(const ck_tile::GroupedConvBwdDataHostArgs& args) const = 0; GroupedConvolutionBackwardDataBaseInvoker() = default; - GroupedConvolutionBackwardDataBaseInvoker(const GroupedConvolutionBackwardDataBaseInvoker&) = default; - GroupedConvolutionBackwardDataBaseInvoker& operator=(const GroupedConvolutionBackwardDataBaseInvoker&) = default; - GroupedConvolutionBackwardDataBaseInvoker(GroupedConvolutionBackwardDataBaseInvoker&&) = default; - GroupedConvolutionBackwardDataBaseInvoker& operator=(GroupedConvolutionBackwardDataBaseInvoker&&) = default; - virtual 
~GroupedConvolutionBackwardDataBaseInvoker() = default; + GroupedConvolutionBackwardDataBaseInvoker(const GroupedConvolutionBackwardDataBaseInvoker&) = + default; + GroupedConvolutionBackwardDataBaseInvoker& + operator=(const GroupedConvolutionBackwardDataBaseInvoker&) = default; + GroupedConvolutionBackwardDataBaseInvoker(GroupedConvolutionBackwardDataBaseInvoker&&) = + default; + GroupedConvolutionBackwardDataBaseInvoker& + operator=(GroupedConvolutionBackwardDataBaseInvoker&&) = default; + virtual ~GroupedConvolutionBackwardDataBaseInvoker() = default; }; -template < - ck_tile::index_t NDimSpatial, - typename InLayout, - typename WeiLayout, - typename OutLayout, - typename InDataType, - typename WeiDataType, - typename OutDataType, - typename InElementwiseOperation, - typename WeiElementwiseOperation, - typename OutElementwiseOperation, - ck_tile::ConvolutionSpecialization ConvSpec, - int kBlockPerCu, - ck_tile::index_t M_Tile, - ck_tile::index_t N_Tile, - ck_tile::index_t K_Tile, - ck_tile::index_t M_Warp, - ck_tile::index_t N_Warp, - ck_tile::index_t K_Warp, - ck_tile::index_t M_Warp_Tile, - ck_tile::index_t N_Warp_Tile, - ck_tile::index_t K_Warp_Tile, - ck_tile::index_t VectorSizeA, - ck_tile::index_t VectorSizeB, - ck_tile::index_t VectorSizeC, - bool DoubleSmemBuffer, - ck_tile::index_t PipelineVersion> -struct GroupedConvolutionBackwardDataInvoker : - public GroupedConvolutionBackwardDataBaseInvoker +template +struct GroupedConvolutionBackwardDataInvoker + : public GroupedConvolutionBackwardDataBaseInvoker { - using GemmShape = ck_tile::TileGemmShape< - ck_tile::sequence, - ck_tile::sequence, - ck_tile::sequence, - GemmConfigBase::PermuteA, - GemmConfigBase::PermuteB>; + using GemmShape = + ck_tile::TileGemmShape, + ck_tile::sequence, + ck_tile::sequence, + GemmConfigBase::PermuteA, + GemmConfigBase::PermuteB>; - //static constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default; + // static constexpr auto ConvSpec = 
ck_tile::ConvolutionSpecialization::Default; using TilePartitioner = - ck_tile::GemmSpatiallyLocalTilePartitioner; + ck_tile::GemmSpatiallyLocalTilePartitioner; using GroupedConvTraitsType = ck_tile::GroupedConvTraits, // = DsLayout - OutLayout, - VectorSizeA, - VectorSizeB, - VectorSizeC>; + ConvSpec, + InLayout, + WeiLayout, + ck_tile::tuple<>, // = DsLayout + OutLayout, + VectorSizeA, + VectorSizeB, + VectorSizeC>; using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits< - GemmConfigBase::kPadM, - GemmConfigBase::kPadN, - GemmConfigBase::kPadK, - DoubleSmemBuffer, - typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdData::AsLayout, - typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdData::BsLayout, - typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdData::CLayout, - GemmConfigBase::TransposeC, - GemmConfigBase::UseStructuredSparsity, - false, // Persistent, - GemmConfigBase::NumWaveGroups>; + GemmConfigBase::kPadM, + GemmConfigBase::kPadN, + GemmConfigBase::kPadK, + DoubleSmemBuffer, + typename GroupedConvTraitsType::template GroupedConvImplicitGemmTraitsBwdData<1>::AsLayout, + typename GroupedConvTraitsType::template GroupedConvImplicitGemmTraitsBwdData<1>::BsLayout, + typename GroupedConvTraitsType::template GroupedConvImplicitGemmTraitsBwdData<1>::CLayout, + GemmConfigBase::TransposeC, + GemmConfigBase::UseStructuredSparsity, + false, // Persistent, + GemmConfigBase::NumWaveGroups>; - using AccDataType = float; + using AccDataType = float; using GemmPipelineProblem = ck_tile::GemmPipelineProblem< - OutDataType, - WeiDataType, - AccDataType, - GemmShape, - typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdData, - ck_tile::element_wise::PassThrough, - ck_tile::element_wise::PassThrough, - InDataType, - true, - VectorSizeA, - VectorSizeB>; + OutDataType, + WeiDataType, + AccDataType, + GemmShape, + typename GroupedConvTraitsType::template GroupedConvImplicitGemmTraitsBwdData<1>, + 
ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + InDataType, + true, + VectorSizeA, + VectorSizeB>; + + using BaseGemmPipeline = typename PipelineTypeTraits< + PipelineVersion>::template UniversalGemmPipeline; - using BaseGemmPipeline = typename PipelineTypeTraits::template UniversalGemmPipeline; - template auto CreateKernel() const { constexpr auto scheduler = GemmConfigBase::Scheduler; - + using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem; + WeiDataType, + AccDataType, + GemmShape, + GemmUniversalTraits, + scheduler, + // HasHotLoop, + // TailNumber, + ck_tile::element_wise::PassThrough, + ck_tile::element_wise::PassThrough, + InDataType, + true, + VectorSizeA, + VectorSizeB>; - using GemmPipeline = typename PipelineTypeTraits::template GemmPipeline; + using GemmPipeline = typename PipelineTypeTraits::template GemmPipeline< + UniversalGemmProblem>; using CDEElementWise = ck_tile::element_wise::PassThrough; - using ConvEpilogue = ck_tile::CShuffleEpilogue, // = DsDataType - AccDataType, - InDataType, - typename GroupedConvTraitsType::ImplicitGemmDsLayout, - ck_tile::tensor_layout::gemm::RowMajor, - CDEElementWise, - TilePartitioner::MPerBlock, - TilePartitioner::NPerBlock, - M_Warp, - N_Warp, - M_Warp_Tile, - N_Warp_Tile, - K_Warp_Tile, - GemmConfigBase::TransposeC, - MemOp, - 1, - true, - GroupedConvTraitsType::VectorSizeC>>; + using ConvEpilogue = ck_tile::CShuffleEpilogue< + ck_tile::CShuffleEpilogueProblem, // = DsDataType + AccDataType, + InDataType, + typename GroupedConvTraitsType::ImplicitGemmDsLayout, + ck_tile::tensor_layout::gemm::RowMajor, + CDEElementWise, + TilePartitioner::MPerBlock, + TilePartitioner::NPerBlock, + M_Warp, + N_Warp, + M_Warp_Tile, + N_Warp_Tile, + K_Warp_Tile, + GemmConfigBase::TransposeC, + MemOp, + 1, + true, + GroupedConvTraitsType::VectorSizeC>>; return ck_tile::GroupedConvolutionBackwardDataKernel{}; + TilePartitioner, + GemmPipeline, + ConvEpilogue>{}; } bool 
IsSupportedArgument(const ck_tile::GroupedConvBwdDataHostArgs& args) const override { - if (args.k_batch > 1) + if(args.k_batch > 1) { - using Kernel = decltype(CreateKernel()); - return Kernel::IsSupportedArgument(args); + using Kernel = decltype(CreateKernel()); + return Kernel::IsSupportedArgument(args); } - using Kernel = decltype(CreateKernel()); + using Kernel = decltype(CreateKernel()); return Kernel::IsSupportedArgument(args); }; - float Run(const ck_tile::GroupedConvBwdDataHostArgs& args, bool time_kernel, int n_warmup=5, int n_repeat=50) const override + float Run(const ck_tile::GroupedConvBwdDataHostArgs& args, + bool time_kernel, + int n_warmup = 5, + int n_repeat = 50) const override { - [[maybe_unused]] ck_tile::index_t KGroups = 1; - for(int i=0; i(); - using Kernel = decltype(kernel); - - auto kargs = Kernel::MakeKernelArgs(args); - const dim3 grids = Kernel::GridSize(args); - const dim3 blocks = Kernel::BlockSize(); + auto kernel = CreateKernel(); + using Kernel = decltype(kernel); - printf("grid: (%u, %u, %u)\n", grids.x, grids.y, grids.z); - - ck_tile::stream_config s {nullptr, time_kernel, 1, n_warmup, n_repeat}; + auto kargs = Kernel::MakeKernelArgs(args); + const dim3 grids = Kernel::GridSize(args); + const dim3 blocks = Kernel::BlockSize(); - ave_time = ck_tile::launch_kernel( - s, ck_tile::make_kernel(kernel, grids, blocks, 0, kargs)); + printf("grid: (%u, %u, %u)\n", grids.x, grids.y, grids.z); - return ave_time; - }; + ck_tile::stream_config s{nullptr, time_kernel, 1, n_warmup, n_repeat}; + + ave_time = ck_tile::launch_kernel( + s, ck_tile::make_kernel(kernel, grids, blocks, 0, kargs)); + + return ave_time; + }; const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) { if(args.k_batch == 1) @@ -272,22 +288,28 @@ struct GroupedConvolutionBackwardDataInvoker : { std::stringstream min_occupancy; min_occupancy << "_blk_per_cu_" << kBlockPerCu; - if (args.k_batch > 1) + if(args.k_batch > 1) { - using Kernel = 
decltype(CreateKernel()); - return Kernel::GetName() + min_occupancy.str(); + using Kernel = decltype(CreateKernel()); + return Kernel::GetName() + min_occupancy.str(); } - using Kernel = decltype(CreateKernel()); + using Kernel = decltype(CreateKernel()); return Kernel::GetName() + min_occupancy.str(); }; - GroupedConvolutionBackwardDataInvoker() = default; + GroupedConvolutionBackwardDataInvoker() = default; GroupedConvolutionBackwardDataInvoker(const GroupedConvolutionBackwardDataInvoker&) = default; - GroupedConvolutionBackwardDataInvoker& operator=(const GroupedConvolutionBackwardDataInvoker&) = default; + GroupedConvolutionBackwardDataInvoker& + operator=(const GroupedConvolutionBackwardDataInvoker&) = default; GroupedConvolutionBackwardDataInvoker(GroupedConvolutionBackwardDataInvoker&&) = default; - GroupedConvolutionBackwardDataInvoker& operator=(GroupedConvolutionBackwardDataInvoker&&) = default; - ~GroupedConvolutionBackwardDataInvoker() override = default; - }; + GroupedConvolutionBackwardDataInvoker& + operator=(GroupedConvolutionBackwardDataInvoker&&) = default; + ~GroupedConvolutionBackwardDataInvoker() override = default; +}; -} -} +} // namespace ops +} // namespace ck_tile diff --git a/profiler/CMakeLists.txt b/profiler/CMakeLists.txt index c17f83eb48..15ed3c8c67 100644 --- a/profiler/CMakeLists.txt +++ b/profiler/CMakeLists.txt @@ -6,4 +6,3 @@ include_directories(BEFORE ) add_subdirectory(src) -add_subdirectory(ck_tile)