moved ck tile profiler to experimental

This commit is contained in:
Jakub Piasecki
2025-12-09 15:12:29 +00:00
parent 65b6d8efcd
commit e2dd65218d
23 changed files with 862 additions and 749 deletions

View File

@@ -41,6 +41,7 @@ include(CTest)
option(ENABLE_CLANG_CPP_CHECKS "Enables clang tidy, cppcheck" ON)
option(MIOPEN_REQ_LIBS_ONLY "Build only the MIOpen required libraries" OFF)
option(CK_EXPERIMENTAL_BUILDER "Enable experimental builder" OFF)
option(CK_EXPERIMENTAL_PROFILER "Enable experimental profiler" ON)
option(BUILD_MHA_LIB "Build the static library for flash attention" OFF)
option(FORCE_DISABLE_XDL "Skip compiling XDL specific instances (even if supported GPUs are included in GPU_TARGETS)" OFF)
option(FORCE_DISABLE_WMMA "Skip compiling WMMA specific instances (even if supported GPUs are included in GPU_TARGETS)" OFF)
@@ -50,6 +51,12 @@ if(CK_EXPERIMENTAL_BUILDER)
include_directories(${PROJECT_SOURCE_DIR}/experimental/builder/include)
endif()
if(CK_EXPERIMENTAL_PROFILER)
add_definitions(-DCK_EXPERIMENTAL_PROFILEr)
include_directories(${PROJECT_SOURCE_DIR}/experimental/ck_tile_profiler/include)
endif()
# Usage: for customized Python location cmake -DCK_USE_ALTERNATIVE_PYTHON="/opt/Python-3.8.13/bin/python3.8"
# CK Codegen requires dataclass which is added in Python 3.7
# Python version 3.8 is required for general good practice as it is default for Ubuntu 20.04
@@ -729,6 +736,10 @@ if (CK_EXPERIMENTAL_BUILDER)
add_subdirectory(experimental/builder)
endif()
if (CK_EXPERIMENTAL_PROFILER)
add_subdirectory(experimental/ck_tile_profiler)
endif()
if(CK_USE_CODEGEN AND (SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS))
add_subdirectory(codegen)
endif()
@@ -768,6 +779,13 @@ if(CK_EXPERIMENTAL_BUILDER)
)
endif()
if(CK_EXPERIMENTAL_PROFILER)
rocm_install(DIRECTORY
${PROJECT_SOURCE_DIR}/ck_tile_profiler
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck_tile
)
endif()
set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
set(CPACK_RPM_PACKAGE_LICENSE "MIT")

View File

@@ -2,4 +2,4 @@ include_directories(BEFORE
${CMAKE_CURRENT_LIST_DIR}/include
)
add_subdirectory(src)
add_subdirectory(src)

View File

@@ -52,12 +52,12 @@ template <ck_tile::index_t NDimSpatial,
typename ComputeTypeA = InDataType,
typename ComputeTypeB = ComputeTypeA>
bool profile_grouped_conv_bwd_data_impl(int do_verification,
int init_method,
bool /*do_log*/,
bool time_kernel,
const ck_tile::conv::ConvParam& conv_param,
const std::string& split_k,
ck_tile::index_t instance_index = -1)
int init_method,
bool /*do_log*/,
bool time_kernel,
const ck_tile::conv::ConvParam& conv_param,
const std::string& split_k,
ck_tile::index_t instance_index = -1)
{
using AccDataType = float;
using InElementOp = ck_tile::element_wise::PassThrough;
@@ -93,24 +93,21 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
ck_tile::FillUniformDistribution<WeiDataType>{-1.f, 1.f}(weight);
ck_tile::FillUniformDistribution<OutDataType>{-1.f, 1.f}(output);
break;
default:
weight.SetZero();
output.SetZero();
default: weight.SetZero(); output.SetZero();
}
using DeviceOp = ops::GroupedConvolutionBackwardDataBaseInvoker<
NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ComputeTypeA,
ComputeTypeB>;
using DeviceOp = ops::GroupedConvolutionBackwardDataBaseInvoker<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ComputeTypeA,
ComputeTypeB>;
// get device op instances
const auto ops = ck_tile::ops::DeviceOperationInstanceFactory<DeviceOp>::GetInstances();
@@ -123,14 +120,15 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
float best_gb_per_sec = 0;
std::string best_split_k("1");
// std::vector<ck_tile::index_t> split_k_list = {1, 2, 4, 6, 8, 10, 12, 16, 19, 32, 38, 64, 76, 128, 152, 256, 304};
// std::vector<ck_tile::index_t> split_k_list = {1, 2, 4, 6, 8, 10, 12, 16, 19, 32, 38, 64, 76,
// 128, 152, 256, 304};
std::vector<ck_tile::index_t> split_k_list = {1, 2, 3, 4, 6, 8, 12, 16};
if(split_k != "all")
{
try
{
ck_tile::index_t split_k_value = std::stoi(split_k);
split_k_list = {split_k_value};
split_k_list = {split_k_value};
}
catch(const std::exception& e)
{
@@ -142,21 +140,21 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
// First, calculate the reference result if verification is needed.
ck_tile::HostTensor<InDataType> input_host_ref(in_g_n_c_wis_desc);
input_host_ref.SetZero();
if (do_verification)
if(do_verification)
{
ck_tile::reference_grouped_conv_bwd_data<NDimSpatial, InDataType, WeiDataType, OutDataType>(
input_host_ref,
weight,
output,
conv_param.conv_filter_strides_,
conv_param.conv_filter_dilations_,
conv_param.input_left_pads_,
conv_param.input_right_pads_);
input_host_ref,
weight,
output,
conv_param.conv_filter_strides_,
conv_param.conv_filter_dilations_,
conv_param.input_left_pads_,
conv_param.input_right_pads_);
}
//instance_index = 0;
// instance_index = 0;
index_t num_kernel = 0;
bool all_pass = true;
bool all_pass = true;
for(auto& op : ops)
{
for(std::size_t split_k_id = 0; split_k_id < split_k_list.size(); split_k_id++)
@@ -173,13 +171,13 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
output_dev_buf.ToDevice(output.data());
ck_tile::GroupedConvBwdDataHostArgs args(conv_param,
input_dev_buf.GetDeviceBuffer(),
weight_dev_buf.GetDeviceBuffer(),
{},
output_dev_buf.GetDeviceBuffer(),
split_k_value);
input_dev_buf.GetDeviceBuffer(),
weight_dev_buf.GetDeviceBuffer(),
{},
output_dev_buf.GetDeviceBuffer(),
split_k_value);
// Split-K autodeduction is not supported.
// Split-K autodeduction is not supported.
if(op->IsSupportedArgument(args) && split_k_value >= 1)
{
num_kernel++;
@@ -190,10 +188,12 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
}
std::string op_name = op->GetName(args);
std::cout << op_name << ", SplitK " << split_k_param_str << " is profiled..." << std::endl;
std::cout << op_name << ", SplitK " << split_k_param_str << " is profiled..."
<< std::endl;
// Run verification first. If it doesn't pass, no need to do performance measurement.
bool pass = false;
// Run verification first. If it doesn't pass, no need to do performance
// measurement.
bool pass = false;
if(do_verification)
{
constexpr int n_warmup = 0;
@@ -202,8 +202,9 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
op->Run(args, false, n_warmup, n_repeat);
input_dev_buf.FromDevice(input.data());
const ck_tile::index_t GemmK = conv_param.K_
* conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1];
const ck_tile::index_t GemmK = conv_param.K_ *
conv_param.filter_spatial_lengths_[0] *
conv_param.filter_spatial_lengths_[1];
const float max_accumulated_value =
*std::max_element(input_host_ref.mData.begin(), input_host_ref.mData.end());
@@ -212,37 +213,39 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
GemmK, split_k_value, max_accumulated_value);
pass = ck_tile::check_err(input,
input_host_ref,
"Error: Incorrect results!",
rtol_atol.at(ck_tile::number<0>{}),
rtol_atol.at(ck_tile::number<1>{}));
input_host_ref,
"Error: Incorrect results!",
rtol_atol.at(ck_tile::number<0>{}),
rtol_atol.at(ck_tile::number<1>{}));
std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
<< " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
<< std::endl;
std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
<< " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
<< std::endl;
std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail")
<< std::endl;
all_pass &= pass;
}
bool is_valid = do_verification ? pass : true;
if (is_valid)
if(is_valid)
{
constexpr int n_warmup = 5;
constexpr int n_repeat = 50;
float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat);
float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat);
std::size_t flop = conv_param.GetFlops();
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
std::size_t flop = conv_param.GetFlops();
std::size_t num_btype =
conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_btype / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops
<< " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK "
<< split_k_param_str << std::endl;
<< " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK "
<< split_k_param_str << std::endl;
if(tflops > best_tflops)
{
best_op_name = op_name;
@@ -253,7 +256,7 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
}
}
}
else
else
{
std::cout << op->GetName(args) << ", SplitK " << split_k_param_str
<< " does not support this problem." << std::endl;
@@ -262,14 +265,11 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
}
std::stringstream ss;
ss << "\n********************************"
<< "\nCK Tile best configuration parameters:"
<< "\n********************************"
<< "\nname: " << best_op_name
<< "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops
<< "\nGB/s: " << best_gb_per_sec
<< "\nSplitK: " << best_split_k
<< std::endl;
ss << "\n********************************"
<< "\nCK Tile best configuration parameters:" << "\n********************************"
<< "\nname: " << best_op_name << "\navg_time: " << best_avg_time
<< "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec
<< "\nSplitK: " << best_split_k << std::endl;
std::cout << ss.str();

View File

@@ -93,24 +93,21 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
ck_tile::FillUniformDistribution<InDataType>{0.f, 1.f}(input);
ck_tile::FillUniformDistribution<OutDataType>{0.f, 1.f}(output);
break;
default:
input.SetZero();
output.SetZero();
default: input.SetZero(); output.SetZero();
}
using DeviceOp = ops::GroupedConvolutionBackwardWeightBaseInvoker<
NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ComputeTypeA,
ComputeTypeB>;
using DeviceOp = ops::GroupedConvolutionBackwardWeightBaseInvoker<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ComputeTypeA,
ComputeTypeB>;
// get device op instances
const auto ops = ck_tile::ops::DeviceOperationInstanceFactory<DeviceOp>::GetInstances();
@@ -123,13 +120,14 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
float best_gb_per_sec = 0;
std::string best_split_k("1");
std::vector<ck_tile::index_t> split_k_list = {1, 2, 4, 6, 8, 10, 12, 16, 19, 32, 38, 64, 76, 128, 152, 256, 304};
std::vector<ck_tile::index_t> split_k_list = {
1, 2, 4, 6, 8, 10, 12, 16, 19, 32, 38, 64, 76, 128, 152, 256, 304};
if(split_k != "all")
{
try
{
ck_tile::index_t split_k_value = std::stoi(split_k);
split_k_list = {split_k_value};
split_k_list = {split_k_value};
}
catch(const std::exception& e)
{
@@ -141,21 +139,21 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
// First, calculate the reference result if verification is needed.
ck_tile::HostTensor<WeiDataType> weight_host_ref(wei_g_k_c_xs_desc);
weight_host_ref.SetZero();
if (do_verification)
if(do_verification)
{
ck_tile::reference_grouped_conv_bwd_weight<NDimSpatial, InDataType, WeiDataType, OutDataType>(
input,
weight_host_ref,
output,
conv_param.conv_filter_strides_,
conv_param.conv_filter_dilations_,
conv_param.input_left_pads_,
conv_param.input_right_pads_);
ck_tile::
reference_grouped_conv_bwd_weight<NDimSpatial, InDataType, WeiDataType, OutDataType>(
input,
weight_host_ref,
output,
conv_param.conv_filter_strides_,
conv_param.conv_filter_dilations_,
conv_param.input_left_pads_,
conv_param.input_right_pads_);
}
index_t num_kernel = 0;
bool all_pass = true;
bool all_pass = true;
for(auto& op : ops)
{
for(std::size_t split_k_id = 0; split_k_id < split_k_list.size(); split_k_id++)
@@ -172,13 +170,13 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
output_dev_buf.ToDevice(output.data());
ck_tile::GroupedConvBwdWeightHostArgs args(conv_param,
input_dev_buf.GetDeviceBuffer(),
weight_dev_buf.GetDeviceBuffer(),
{},
output_dev_buf.GetDeviceBuffer(),
split_k_value);
input_dev_buf.GetDeviceBuffer(),
weight_dev_buf.GetDeviceBuffer(),
{},
output_dev_buf.GetDeviceBuffer(),
split_k_value);
// Split-K autodeduction is not supported.
// Split-K autodeduction is not supported.
if(op->IsSupportedArgument(args) && split_k_value >= 1)
{
num_kernel++;
@@ -189,10 +187,12 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
}
std::string op_name = op->GetName(args);
std::cout << op_name << ", SplitK " << split_k_param_str << " is profiled..." << std::endl;
std::cout << op_name << ", SplitK " << split_k_param_str << " is profiled..."
<< std::endl;
// Run verification first. If it doesn't pass, no need to do performance measurement.
bool pass = false;
// Run verification first. If it doesn't pass, no need to do performance
// measurement.
bool pass = false;
if(do_verification)
{
constexpr int n_warmup = 0;
@@ -201,45 +201,48 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
op->Run(args, false, n_warmup, n_repeat);
weight_dev_buf.FromDevice(weight.data());
const ck_tile::index_t GemmK = weight.get_element_size() / (conv_param.G_ * conv_param.K_);
const float max_accumulated_value =
*std::max_element(weight_host_ref.mData.begin(), weight_host_ref.mData.end());
const ck_tile::index_t GemmK =
weight.get_element_size() / (conv_param.G_ * conv_param.K_);
const float max_accumulated_value = *std::max_element(
weight_host_ref.mData.begin(), weight_host_ref.mData.end());
const auto rtol_atol =
calculate_rtol_atol<InDataType, WeiDataType, AccDataType, OutDataType>(
GemmK, split_k_value, max_accumulated_value);
pass = ck_tile::check_err(weight,
weight_host_ref,
"Error: Incorrect results!",
rtol_atol.at(ck_tile::number<0>{}),
rtol_atol.at(ck_tile::number<1>{}));
weight_host_ref,
"Error: Incorrect results!",
rtol_atol.at(ck_tile::number<0>{}),
rtol_atol.at(ck_tile::number<1>{}));
std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
<< " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
<< std::endl;
std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
<< " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
<< std::endl;
std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail")
<< std::endl;
all_pass &= pass;
}
bool is_valid = do_verification ? pass : true;
if (is_valid)
if(is_valid)
{
constexpr int n_warmup = 5;
constexpr int n_repeat = 50;
float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat);
float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat);
std::size_t flop = conv_param.GetFlops();
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
std::size_t flop = conv_param.GetFlops();
std::size_t num_btype =
conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
float gb_per_sec = num_btype / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops
<< " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK "
<< split_k_param_str << std::endl;
<< " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK "
<< split_k_param_str << std::endl;
if(tflops > best_tflops)
{
best_op_name = op_name;
@@ -250,7 +253,7 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
}
}
}
else
else
{
std::cout << op->GetName(args) << ", SplitK " << split_k_param_str
<< " does not support this problem." << std::endl;
@@ -259,14 +262,11 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
}
std::stringstream ss;
ss << "\n********************************"
<< "\nCK Tile best configuration parameters:"
<< "\n********************************"
<< "\nname: " << best_op_name
<< "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops
<< "\nGB/s: " << best_gb_per_sec
<< "\nSplitK: " << best_split_k
<< std::endl;
ss << "\n********************************"
<< "\nCK Tile best configuration parameters:" << "\n********************************"
<< "\nname: " << best_op_name << "\navg_time: " << best_avg_time
<< "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec
<< "\nSplitK: " << best_split_k << std::endl;
std::cout << ss.str();

View File

@@ -52,12 +52,12 @@ template <ck_tile::index_t NDimSpatial,
typename ComputeTypeA = InDataType,
typename ComputeTypeB = ComputeTypeA>
bool profile_grouped_conv_fwd_impl(int do_verification,
int init_method,
bool /*do_log*/,
bool time_kernel,
const ck_tile::conv::ConvParam& conv_param,
const ck_tile::index_t k_batch,
ck_tile::index_t instance_index = -1)
int init_method,
bool /*do_log*/,
bool time_kernel,
const ck_tile::conv::ConvParam& conv_param,
const ck_tile::index_t k_batch,
ck_tile::index_t instance_index = -1)
{
using AccDataType = float;
using InElementOp = ck_tile::element_wise::PassThrough;
@@ -100,19 +100,18 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
weight.SetZero();
}
using DeviceOp = ops::GroupedConvolutionForwardBaseInvoker<
NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ComputeTypeA,
ComputeTypeB>;
using DeviceOp = ops::GroupedConvolutionForwardBaseInvoker<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementOp,
WeiElementOp,
OutElementOp,
ComputeTypeA,
ComputeTypeB>;
// get device op instances
const auto ops = ck_tile::ops::DeviceOperationInstanceFactory<DeviceOp>::GetInstances();
@@ -125,7 +124,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
float best_gb_per_sec = 0;
index_t num_kernel = 0;
bool all_pass = true;
bool all_pass = true;
// tmp enforce instance
// instance_index = -1;
@@ -141,12 +140,12 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
output_dev_buf.SetZero();
ck_tile::GroupedConvFwdHostArgs args(conv_param,
input_dev_buf.GetDeviceBuffer(),
weight_dev_buf.GetDeviceBuffer(),
{},
output_dev_buf.GetDeviceBuffer(),
k_batch);
input_dev_buf.GetDeviceBuffer(),
weight_dev_buf.GetDeviceBuffer(),
{},
output_dev_buf.GetDeviceBuffer(),
k_batch);
if(op->IsSupportedArgument(args))
{
num_kernel++;
@@ -160,7 +159,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
std::cout << op_name << " is profiled..." << std::endl;
// Run verification first. If it doesn't pass, no need to do performance measurement.
bool pass = false;
bool pass = false;
if(do_verification)
{
constexpr int n_warmup = 0;
@@ -172,39 +171,42 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
ck_tile::HostTensor<OutDataType> output_host_ref(out_g_n_k_wos_desc);
output_host_ref.SetZero();
ck_tile::reference_grouped_conv_fwd<NDimSpatial, InDataType, WeiDataType, OutDataType>(
input,
weight,
output_host_ref,
conv_param.conv_filter_strides_,
conv_param.conv_filter_dilations_,
conv_param.input_left_pads_,
conv_param.input_right_pads_);
const ck_tile::index_t GemmK = weight.get_element_size() / (conv_param.G_ * conv_param.K_);
ck_tile::
reference_grouped_conv_fwd<NDimSpatial, InDataType, WeiDataType, OutDataType>(
input,
weight,
output_host_ref,
conv_param.conv_filter_strides_,
conv_param.conv_filter_dilations_,
conv_param.input_left_pads_,
conv_param.input_right_pads_);
const ck_tile::index_t GemmK =
weight.get_element_size() / (conv_param.G_ * conv_param.K_);
const float max_accumulated_value =
*std::max_element(output_host_ref.mData.begin(), output_host_ref.mData.end());
const auto rtol_atol =
calculate_rtol_atol<InDataType, WeiDataType, AccDataType, OutDataType>(
GemmK, k_batch, max_accumulated_value);
pass = ck_tile::check_err(output,
output_host_ref,
"Error: Incorrect results!",
rtol_atol.at(ck_tile::number<0>{}),
rtol_atol.at(ck_tile::number<1>{}));
output_host_ref,
"Error: Incorrect results!",
rtol_atol.at(ck_tile::number<0>{}),
rtol_atol.at(ck_tile::number<1>{}));
std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
<< " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
<< std::endl;
std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
<< " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
<< std::endl;
std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail")
<< std::endl;
all_pass &= pass;
}
bool is_valid = do_verification ? pass : true;
if (is_valid)
if(is_valid)
{
constexpr int n_warmup = 5;
constexpr int n_repeat = 50;
float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat);
float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat);
std::size_t flop = conv_param.GetFlops();
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
@@ -213,7 +215,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
float gb_per_sec = num_btype / 1.E6 / avg_time;
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops
<< " TFlops, " << gb_per_sec << " GB/s, " << op_name << std::endl;
<< " TFlops, " << gb_per_sec << " GB/s, " << op_name << std::endl;
if(tflops > best_tflops)
{
@@ -224,18 +226,16 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
}
}
}
else
else
{
//std::cout << op->GetName(args) << " does not support this problem." << std::endl;
// std::cout << op->GetName(args) << " does not support this problem." << std::endl;
}
}
std::cout << "\n********************************"
<< "\nBest configuration parameters:"
<< "\n********************************"
<< "\nname: " << best_op_name
<< "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops
<< "\nGB/s: " << best_gb_per_sec << std::endl;
std::cout << "\n********************************"
<< "\nBest configuration parameters:" << "\n********************************"
<< "\nname: " << best_op_name << "\navg_time: " << best_avg_time
<< "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;
const char* log_file = std::getenv("CK_TILE_PROFILER_LOG_FILE");
if(log_file != nullptr)

View File

@@ -25,9 +25,9 @@ enum struct ConvLayout
enum struct ConvDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
};
#define OP_NAME "grouped_conv_bwd_data"
@@ -38,14 +38,14 @@ static void print_helper_msg()
std::string conv_param_parser_helper_msg;
conv_param_parser_helper_msg += "Following arguments (depending on number of spatial dims):\n"
" Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n"
" G, N, K, C, \n"
" <filter spatial dimensions>, (ie Y, X for 2D)\n"
" <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
" <strides>, (ie Sy, Sx for 2D)\n"
" <dilations>, (ie Dy, Dx for 2D)\n"
" <left padding>, (ie LeftPy, LeftPx for 2D)\n"
" <right padding>, (ie RightPy, RightPx for 2D)\n";
" Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n"
" G, N, K, C, \n"
" <filter spatial dimensions>, (ie Y, X for 2D)\n"
" <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
" <strides>, (ie Sy, Sx for 2D)\n"
" <dilations>, (ie Dy, Dx for 2D)\n"
" <left padding>, (ie LeftPy, LeftPx for 2D)\n"
" <right padding>, (ie RightPy, RightPx for 2D)\n";
std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
<< "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
@@ -111,17 +111,17 @@ int tile_profile_grouped_conv_bwd_data(int argc, char* argv[])
// using F8 = ck_tile::fp8_t;
// using BF8 = ck_tile::bf8_t;
using NHWGC = ck_tile::tensor_layout::convolution::NHWGC;
using NHWGC = ck_tile::tensor_layout::convolution::NHWGC;
// using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC;
using GKYXC = ck_tile::tensor_layout::convolution::GKYXC;
using GKYXC = ck_tile::tensor_layout::convolution::GKYXC;
// using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC;
using NHWGK = ck_tile::tensor_layout::convolution::NHWGK;
using NHWGK = ck_tile::tensor_layout::convolution::NHWGK;
// using NDHWGK = ck_tile::tensor_layout::convolution::NDHWGK;
constexpr auto I2 = ck_tile::number<2>{};
//constexpr auto I3 = ck_tile::number<3>{};
// constexpr auto I3 = ck_tile::number<3>{};
auto profile = [&](auto num_dim_spatial_tmp,
auto in_layout,
@@ -146,14 +146,14 @@ int tile_profile_grouped_conv_bwd_data(int argc, char* argv[])
using ComputeTypeB = decltype(compute_type_b);
bool pass = ck_tile::profiler::profile_grouped_conv_bwd_data_impl<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
ComputeTypeA,
ComputeTypeB>(
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
ComputeTypeA,
ComputeTypeB>(
do_verification, init_method, do_log, time_kernel, params, split_k);
return pass ? 0 : 1;
@@ -179,7 +179,7 @@ int tile_profile_grouped_conv_bwd_data(int argc, char* argv[])
return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
}
}
// if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
// {
// if(data_type == ConvDataType::F32_F32_F32)
@@ -193,7 +193,8 @@ int tile_profile_grouped_conv_bwd_data(int argc, char* argv[])
// if(data_type == ConvDataType::BF16_F32_BF16)
// {
// // fp32 atomic add is used for weight tensor in bf16 kernel
// return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
// return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, F32{}, BF16{}, BF16{},
// BF16{});
// }
// if(data_type == ConvDataType::BF16_BF16_BF16)
// {
@@ -207,7 +208,8 @@ int tile_profile_grouped_conv_bwd_data(int argc, char* argv[])
// else if(data_type == ConvDataType::I8_I8_I8)
// {
// return profile(
// I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, int8_t{}, int8_t{}, int8_t{}, int8_t{}, int8_t{});
// I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, int8_t{}, int8_t{}, int8_t{}, int8_t{},
// int8_t{});
// }
// }

View File

@@ -43,14 +43,14 @@ static void print_helper_msg()
std::string conv_param_parser_helper_msg;
conv_param_parser_helper_msg += "Following arguments (depending on number of spatial dims):\n"
" Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n"
" G, N, K, C, \n"
" <filter spatial dimensions>, (ie Y, X for 2D)\n"
" <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
" <strides>, (ie Sy, Sx for 2D)\n"
" <dilations>, (ie Dy, Dx for 2D)\n"
" <left padding>, (ie LeftPy, LeftPx for 2D)\n"
" <right padding>, (ie RightPy, RightPx for 2D)\n";
" Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n"
" G, N, K, C, \n"
" <filter spatial dimensions>, (ie Y, X for 2D)\n"
" <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
" <strides>, (ie Sy, Sx for 2D)\n"
" <dilations>, (ie Dy, Dx for 2D)\n"
" <left padding>, (ie LeftPy, LeftPx for 2D)\n"
" <right padding>, (ie RightPy, RightPx for 2D)\n";
std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
<< "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
@@ -151,14 +151,14 @@ int tile_profile_grouped_conv_bwd_weight(int argc, char* argv[])
using ComputeTypeB = decltype(compute_type_b);
bool pass = ck_tile::profiler::profile_grouped_conv_bwd_weight_impl<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
ComputeTypeA,
ComputeTypeB>(
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
ComputeTypeA,
ComputeTypeB>(
do_verification, init_method, do_log, time_kernel, params, split_k);
return pass ? 0 : 1;
@@ -184,7 +184,7 @@ int tile_profile_grouped_conv_bwd_weight(int argc, char* argv[])
return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
}
}
if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
{
if(data_type == ConvDataType::F32_F32_F32)

View File

@@ -25,10 +25,10 @@ enum struct ConvLayout
enum struct ConvDataType
{
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
I8_I8_I8, // 3
F32_F32_F32, // 0
F16_F16_F16, // 1
BF16_BF16_BF16, // 2
I8_I8_I8, // 3
};
#define OP_NAME "grouped_conv_fwd"
@@ -39,14 +39,14 @@ static void print_helper_msg()
std::string conv_param_parser_helper_msg;
conv_param_parser_helper_msg += "Following arguments (depending on number of spatial dims):\n"
" Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n"
" G, N, K, C, \n"
" <filter spatial dimensions>, (ie Y, X for 2D)\n"
" <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
" <strides>, (ie Sy, Sx for 2D)\n"
" <dilations>, (ie Dy, Dx for 2D)\n"
" <left padding>, (ie LeftPy, LeftPx for 2D)\n"
" <right padding>, (ie RightPy, RightPx for 2D)\n";
" Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n"
" G, N, K, C, \n"
" <filter spatial dimensions>, (ie Y, X for 2D)\n"
" <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
" <strides>, (ie Sy, Sx for 2D)\n"
" <dilations>, (ie Dy, Dx for 2D)\n"
" <left padding>, (ie LeftPy, LeftPx for 2D)\n"
" <right padding>, (ie RightPy, RightPx for 2D)\n";
std::cout
// clang-format off
@@ -101,7 +101,7 @@ int tile_profile_grouped_conv_fwd(int argc, char* argv[])
return 1;
}
const auto params = ck_tile::conv::parse_conv_param(num_dim_spatial, 9, argv);
const auto params = ck_tile::conv::parse_conv_param(num_dim_spatial, 9, argv);
constexpr ck_tile::index_t k_batch = 1;
using F32 = float;
@@ -143,14 +143,14 @@ int tile_profile_grouped_conv_fwd(int argc, char* argv[])
using ComputeTypeB = decltype(compute_type_b);
bool pass = ck_tile::profiler::profile_grouped_conv_fwd_impl<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
ComputeTypeA,
ComputeTypeB>(
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
ComputeTypeA,
ComputeTypeB>(
do_verification, init_method, do_log, time_kernel, params, k_batch);
return pass ? 0 : 1;
@@ -171,7 +171,7 @@ int tile_profile_grouped_conv_fwd(int argc, char* argv[])
return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
}
}
if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
{
if(data_type == ConvDataType::F32_F32_F32)

View File

@@ -5,6 +5,8 @@
#include "ck_tile/core.hpp"
#include "ck_tile/host/convolution_parameter.hpp"
#include "ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp"
#include "ck_tile/ops/grouped_convolution/utils/convolution_specialization.hpp"
#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
namespace ck_tile {

View File

@@ -3,3 +3,4 @@
add_subdirectory(src/tensor_operation_instance/gpu)
add_subdirectory(src/utility)
add_subdirectory(src/ck_tile/tensor_operation_instance/gpu)

View File

@@ -0,0 +1,90 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <string>
#include <variant>
#include "ck_tile/core.hpp"
#include "ck_tile/host/kernel_launch.hpp"
#include "ck_tile/ops/epilogue.hpp"
#include "ck_tile/ops/gemm.hpp"
#define CK_TILE_PIPELINE_COMPUTE_V3 1
#define CK_TILE_PIPELINE_MEMORY 2
#define CK_TILE_PIPELINE_COMPUTE_V4 3
#define CK_TILE_PIPELINE_COMPUTE_V5 4
namespace ck_tile {
namespace ops {
using MemoryOpSet =
std::integral_constant<ck_tile::memory_operation_enum, ck_tile::memory_operation_enum::set>;
using MemoryOpAtomicAdd = std::integral_constant<ck_tile::memory_operation_enum,
ck_tile::memory_operation_enum::atomic_add>;
struct GemmConfigBase
{
static constexpr bool kPadM = true;
static constexpr bool kPadN = true;
static constexpr bool kPadK = true;
static constexpr bool PermuteA = false;
static constexpr bool PermuteB = false;
static constexpr bool TransposeC = false;
static constexpr bool UseStructuredSparsity = false;
static constexpr int kBlockPerCu = 1;
static constexpr ck_tile::index_t TileParitionerGroupNum = 8;
static constexpr ck_tile::index_t TileParitionerM01 = 4;
static constexpr auto Scheduler = ck_tile::GemmPipelineScheduler::Intrawave;
static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V3;
static constexpr ck_tile::index_t NumWaveGroups = 1;
static constexpr bool Preshuffle = false;
static constexpr bool TiledMMAPermuteN = false;
};
template <ck_tile::index_t PipelineId>
struct PipelineTypeTraits;
template <>
struct PipelineTypeTraits<CK_TILE_PIPELINE_MEMORY>
{
template <typename PipelineProblem>
using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem<PipelineProblem>;
template <typename PipelineProblem>
using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem<PipelineProblem>;
};
template <>
struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V3>
{
template <typename PipelineProblem>
using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV3<PipelineProblem>;
template <typename PipelineProblem>
using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV3<PipelineProblem>;
};
template <>
struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V4>
{
template <typename PipelineProblem>
using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV4<PipelineProblem>;
template <typename PipelineProblem>
using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV4<PipelineProblem>;
};
template <>
struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V5>
{
template <typename PipelineProblem>
using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV5<PipelineProblem>;
template <typename PipelineProblem>
using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV5<PipelineProblem>;
};
} // namespace ops
} // namespace ck_tile

View File

@@ -11,32 +11,29 @@ namespace ops {
using BF16 = ck_tile::bfloat16_t;
using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2,
NHWGC,
GKYXC,
NHWGK,
BF16,
BF16,
BF16,
PassThrough,
PassThrough,
PassThrough,
BF16,
BF16>;
NHWGC,
GKYXC,
NHWGK,
BF16,
BF16,
BF16,
PassThrough,
PassThrough,
PassThrough,
BF16,
BF16>;
template <ck_tile::index_t NDimSpatial,
typename ALayout,
typename BLayout,
typename ELayout>
template <ck_tile::index_t NDimSpatial, typename ALayout, typename BLayout, typename ELayout>
using tile_grouped_conv_bwd_data_bf16_instances = std::tuple<
// clang-format off
// clang-format off
//###################################| Num| InLayout| WeiLayout| OutLayout| InData|WeiData|OutData| In| Wei| Out| Conv|K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM|
//###################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline|
//###################################|Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version|
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 16, 4, 1, 1,16, 16, 16, 4, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 16, 4, 1, 1,16, 16, 16, 4, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 2,2, false, CK_TILE_PIPELINE_MEMORY>
// clang-format on
>;
// clang-format on
>;
} // namespace ops
} // namespace ck_tile

View File

@@ -11,58 +11,55 @@ namespace ops {
using BF16 = ck_tile::bfloat16_t;
using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2,
NHWGC,
GKYXC,
NHWGK,
BF16,
BF16,
BF16,
PassThrough,
PassThrough,
PassThrough,
BF16,
BF16>;
NHWGC,
GKYXC,
NHWGK,
BF16,
BF16,
BF16,
PassThrough,
PassThrough,
PassThrough,
BF16,
BF16>;
template <ck_tile::index_t NDimSpatial,
typename ALayout,
typename BLayout,
typename ELayout>
template <ck_tile::index_t NDimSpatial, typename ALayout, typename BLayout, typename ELayout>
using tile_grouped_conv_bwd_data_bf16_instances_2 = std::tuple<
// clang-format off
// clang-format off
//###################################| Num| InLayout| WeiLayout| OutLayout| InData|WeiData|OutData| In| Wei| Out| Conv|K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM|
//###################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline|
//###################################|Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version|
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 2,2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 2,2, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 2,2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 2,2, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 16, 4, 1, 1,16, 16, 16, 4, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>, // prob this
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 16, 4, 1, 1,16, 16, 16, 4, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 1, 8,4, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 8,4, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 1, 8,4, true, CK_TILE_PIPELINE_COMPUTE_V4>
// clang-format on
>;
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 2,2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 2,2, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 2,2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 2,2, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 16, 4, 1, 1,16, 16, 16, 4, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>, // prob this
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 16, 4, 1, 1,16, 16, 16, 4, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 1, 8,4, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 8,4, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 1, 8,4, true, CK_TILE_PIPELINE_COMPUTE_V4>
// clang-format on
>;
} // namespace ops
} // namespace ck_tile

View File

@@ -11,61 +11,58 @@ namespace ops {
using BF16 = ck_tile::bfloat16_t;
using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2,
NHWGC,
GKYXC,
NHWGK,
BF16,
BF16,
BF16,
PassThrough,
PassThrough,
PassThrough,
BF16,
BF16>;
NHWGC,
GKYXC,
NHWGK,
BF16,
BF16,
BF16,
PassThrough,
PassThrough,
PassThrough,
BF16,
BF16>;
template <ck_tile::index_t NDimSpatial,
typename ALayout,
typename BLayout,
typename ELayout>
template <ck_tile::index_t NDimSpatial, typename ALayout, typename BLayout, typename ELayout>
using tile_grouped_conv_bwd_data_bf16_instances_3 = std::tuple<
// clang-format off
// clang-format off
//###################################| Num| InLayout| WeiLayout| OutLayout| InData|WeiData|OutData| In| Wei| Out| Conv|K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM|
//###################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline|
//###################################|Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version|
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 1, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>, // ta
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 4,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 4,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 4,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 256, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 256, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 256, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 64, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 64, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 64, 32, 2, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 64, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 64, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>
// clang-format on
>;
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 1, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>, // ta
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 4,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 4,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 4,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 256, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 256, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 256, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 64, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 64, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 64, 32, 2, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 64, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 64, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>
// clang-format on
>;
} // namespace ops
} // namespace ck_tile

View File

@@ -11,58 +11,55 @@ namespace ops {
using BF16 = ck_tile::bfloat16_t;
using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2,
NHWGC,
GKYXC,
NHWGK,
BF16,
BF16,
BF16,
PassThrough,
PassThrough,
PassThrough,
BF16,
BF16>;
NHWGC,
GKYXC,
NHWGK,
BF16,
BF16,
BF16,
PassThrough,
PassThrough,
PassThrough,
BF16,
BF16>;
template <ck_tile::index_t NDimSpatial,
typename ALayout,
typename BLayout,
typename ELayout>
template <ck_tile::index_t NDimSpatial, typename ALayout, typename BLayout, typename ELayout>
using tile_grouped_conv_bwd_data_bf16_instances_4 = std::tuple<
// clang-format off
// clang-format off
//###################################| Num| InLayout| WeiLayout| OutLayout| InData|WeiData|OutData| In| Wei| Out| Conv|K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM|
//###################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline|
//###################################|Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version|
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 64, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 32, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 32, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 32, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 32, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 32, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 32, 32, 1, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 32, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 32, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 32, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 4, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 16, 4, 1, 1,16, 16, 16, 4, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 2,2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 2,2, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 4, 1, 1,32, 32, 16, 8, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 16, 4, 1, 1,32, 32, 16, 4, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>
// clang-format on
>;
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 64, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 32, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 32, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 32, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 32, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 32, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 32, 32, 1, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 32, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 32, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 32, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 4, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 16, 4, 1, 1,16, 16, 16, 4, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 2,2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 2,2, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 4, 1, 1,32, 32, 16, 8, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 16, 4, 1, 1,32, 32, 16, 4, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>
// clang-format on
>;
} // namespace ops
} // namespace ck_tile

View File

@@ -11,62 +11,59 @@ namespace ops {
using BF16 = ck_tile::bfloat16_t;
using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2,
NHWGC,
GKYXC,
NHWGK,
BF16,
BF16,
BF16,
PassThrough,
PassThrough,
PassThrough,
BF16,
BF16>;
NHWGC,
GKYXC,
NHWGK,
BF16,
BF16,
BF16,
PassThrough,
PassThrough,
PassThrough,
BF16,
BF16>;
template <ck_tile::index_t NDimSpatial,
typename ALayout,
typename BLayout,
typename ELayout>
template <ck_tile::index_t NDimSpatial, typename ALayout, typename BLayout, typename ELayout>
using tile_grouped_conv_bwd_data_bf16_instances_5 = std::tuple<
// clang-format off
// clang-format off
//###################################| Num| InLayout| WeiLayout| OutLayout| InData|WeiData|OutData| In| Wei| Out| Conv|K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM|
//###################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline|
//###################################|Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version|
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 1, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>, // ta
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 4,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 4,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 4,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 64, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 64, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 64, 32, 2, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 64, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 64, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 64, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>
// clang-format on
>;
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 1, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>, // ta
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 4,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 4,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 4,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 64, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 64, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 64, 32, 2, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 64, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 64, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 64, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>
// clang-format on
>;
} // namespace ops
} // namespace ck_tile

View File

@@ -11,60 +11,57 @@ namespace ops {
using BF16 = ck_tile::bfloat16_t;
using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2,
NHWGC,
GKYXC,
NHWGK,
BF16,
BF16,
BF16,
PassThrough,
PassThrough,
PassThrough,
BF16,
BF16>;
NHWGC,
GKYXC,
NHWGK,
BF16,
BF16,
BF16,
PassThrough,
PassThrough,
PassThrough,
BF16,
BF16>;
template <ck_tile::index_t NDimSpatial,
typename ALayout,
typename BLayout,
typename ELayout>
template <ck_tile::index_t NDimSpatial, typename ALayout, typename BLayout, typename ELayout>
using tile_grouped_conv_bwd_data_bf16_instances_6 = std::tuple<
// clang-format off
// clang-format off
//###################################| Num| InLayout| WeiLayout| OutLayout| InData|WeiData|OutData| In| Wei| Out| Conv|K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM|
//###################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline|
//###################################|Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version|
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 32, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 32, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 32, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 32, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 32, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 32, 32, 1, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 32, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 32, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 32, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 4, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 16, 4, 1, 1,16, 16, 16, 4, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 2,2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 2,2, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 4, 1, 1,32, 32, 16, 8, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 16, 4, 1, 1,32, 32, 16, 4, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 4, 1, 1,32, 32, 16, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 16, 4, 1, 1,16, 16, 16, 4, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 8,4, false, CK_TILE_PIPELINE_MEMORY>
// clang-format on
>;
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 32, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 32, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 32, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 32, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 32, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 32, 32, 1, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 32, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 32, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 32, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 4, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 16, 4, 1, 1,16, 16, 16, 4, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 2,2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 2,2, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 4, 1, 1,32, 32, 16, 8, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 16, 4, 1, 1,32, 32, 16, 4, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 4, 1, 1,32, 32, 16, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 16, 4, 1, 1,16, 16, 16, 4, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 8,4, false, CK_TILE_PIPELINE_MEMORY>
// clang-format on
>;
} // namespace ops
} // namespace ck_tile

View File

@@ -11,64 +11,61 @@ namespace ops {
using BF16 = ck_tile::bfloat16_t;
using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2,
NHWGC,
GKYXC,
NHWGK,
BF16,
BF16,
BF16,
PassThrough,
PassThrough,
PassThrough,
BF16,
BF16>;
NHWGC,
GKYXC,
NHWGK,
BF16,
BF16,
BF16,
PassThrough,
PassThrough,
PassThrough,
BF16,
BF16>;
template <ck_tile::index_t NDimSpatial,
typename ALayout,
typename BLayout,
typename ELayout>
template <ck_tile::index_t NDimSpatial, typename ALayout, typename BLayout, typename ELayout>
using tile_grouped_conv_bwd_data_bf16_instances_7 = std::tuple<
// clang-format off
// clang-format off
//###################################| Num| InLayout| WeiLayout| OutLayout| InData|WeiData|OutData| In| Wei| Out| Conv|K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM|
//###################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline|
//###################################|Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version|
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 4, 1, 1,32, 32, 16, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 16, 4, 1, 1,16, 16, 16, 4, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 16, 4, 1, 1,16, 16, 16, 4, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 2,2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 2,2, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 2,2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 2,2, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 16, 4, 1, 1,16, 16, 16, 4, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>, // prob this
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 16, 4, 1, 1,16, 16, 16, 4, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 1, 8,4, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 8,4, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 1, 8,4, true, CK_TILE_PIPELINE_COMPUTE_V4>
// clang-format on
>;
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 4, 1, 1,32, 32, 16, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 16, 4, 1, 1,16, 16, 16, 4, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 16, 4, 1, 1,16, 16, 16, 4, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 2,2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 2,2, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 2,2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 2,2, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 16, 4, 1, 1,16, 16, 16, 4, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>, // prob this
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 16, 4, 1, 1,16, 16, 16, 4, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 1, 8,4, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 8,4, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 1, 8,4, true, CK_TILE_PIPELINE_COMPUTE_V4>
// clang-format on
>;
} // namespace ops
} // namespace ck_tile

View File

@@ -11,7 +11,7 @@
#include "ck_tile/host/kernel_launch.hpp"
#include "ck_tile/ops/epilogue.hpp"
#include "ck_tile/ops/gemm.hpp"
#include "ck_tile/ops/grouped_convolution.hpp"
#include "ck_tile/ops/grouped_convolution.hpp"
#include "ck_tile/library/tensor_operation_instance/gpu/gemm_configs.hpp"
namespace ck_tile {
@@ -31,192 +31,209 @@ template <ck_tile::index_t NDimSpatial,
typename ComputeTypeB = ComputeTypeA>
struct GroupedConvolutionBackwardDataBaseInvoker
{
virtual bool IsSupportedArgument(const ck_tile::GroupedConvBwdDataHostArgs& args) const = 0;
virtual float Run(const ck_tile::GroupedConvBwdDataHostArgs& args, bool time_kernel, int n_warmup, int n_repeat) const = 0;
virtual std::string GetName(const ck_tile::GroupedConvBwdDataHostArgs& args) const = 0;
virtual bool IsSupportedArgument(const ck_tile::GroupedConvBwdDataHostArgs& args) const = 0;
virtual float Run(const ck_tile::GroupedConvBwdDataHostArgs& args,
bool time_kernel,
int n_warmup,
int n_repeat) const = 0;
virtual std::string GetName(const ck_tile::GroupedConvBwdDataHostArgs& args) const = 0;
GroupedConvolutionBackwardDataBaseInvoker() = default;
GroupedConvolutionBackwardDataBaseInvoker(const GroupedConvolutionBackwardDataBaseInvoker&) = default;
GroupedConvolutionBackwardDataBaseInvoker& operator=(const GroupedConvolutionBackwardDataBaseInvoker&) = default;
GroupedConvolutionBackwardDataBaseInvoker(GroupedConvolutionBackwardDataBaseInvoker&&) = default;
GroupedConvolutionBackwardDataBaseInvoker& operator=(GroupedConvolutionBackwardDataBaseInvoker&&) = default;
virtual ~GroupedConvolutionBackwardDataBaseInvoker() = default;
GroupedConvolutionBackwardDataBaseInvoker(const GroupedConvolutionBackwardDataBaseInvoker&) =
default;
GroupedConvolutionBackwardDataBaseInvoker&
operator=(const GroupedConvolutionBackwardDataBaseInvoker&) = default;
GroupedConvolutionBackwardDataBaseInvoker(GroupedConvolutionBackwardDataBaseInvoker&&) =
default;
GroupedConvolutionBackwardDataBaseInvoker&
operator=(GroupedConvolutionBackwardDataBaseInvoker&&) = default;
virtual ~GroupedConvolutionBackwardDataBaseInvoker() = default;
};
template <
ck_tile::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename OutLayout,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename InElementwiseOperation,
typename WeiElementwiseOperation,
typename OutElementwiseOperation,
ck_tile::ConvolutionSpecialization ConvSpec,
int kBlockPerCu,
ck_tile::index_t M_Tile,
ck_tile::index_t N_Tile,
ck_tile::index_t K_Tile,
ck_tile::index_t M_Warp,
ck_tile::index_t N_Warp,
ck_tile::index_t K_Warp,
ck_tile::index_t M_Warp_Tile,
ck_tile::index_t N_Warp_Tile,
ck_tile::index_t K_Warp_Tile,
ck_tile::index_t VectorSizeA,
ck_tile::index_t VectorSizeB,
ck_tile::index_t VectorSizeC,
bool DoubleSmemBuffer,
ck_tile::index_t PipelineVersion>
struct GroupedConvolutionBackwardDataInvoker :
public GroupedConvolutionBackwardDataBaseInvoker<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementwiseOperation,
WeiElementwiseOperation,
OutElementwiseOperation>
template <ck_tile::index_t NDimSpatial,
typename InLayout,
typename WeiLayout,
typename OutLayout,
typename InDataType,
typename WeiDataType,
typename OutDataType,
typename InElementwiseOperation,
typename WeiElementwiseOperation,
typename OutElementwiseOperation,
ck_tile::ConvolutionSpecialization ConvSpec,
int kBlockPerCu,
ck_tile::index_t M_Tile,
ck_tile::index_t N_Tile,
ck_tile::index_t K_Tile,
ck_tile::index_t M_Warp,
ck_tile::index_t N_Warp,
ck_tile::index_t K_Warp,
ck_tile::index_t M_Warp_Tile,
ck_tile::index_t N_Warp_Tile,
ck_tile::index_t K_Warp_Tile,
ck_tile::index_t VectorSizeA,
ck_tile::index_t VectorSizeB,
ck_tile::index_t VectorSizeC,
bool DoubleSmemBuffer,
ck_tile::index_t PipelineVersion>
struct GroupedConvolutionBackwardDataInvoker
: public GroupedConvolutionBackwardDataBaseInvoker<NDimSpatial,
InLayout,
WeiLayout,
OutLayout,
InDataType,
WeiDataType,
OutDataType,
InElementwiseOperation,
WeiElementwiseOperation,
OutElementwiseOperation>
{
using GemmShape = ck_tile::TileGemmShape<
ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>,
GemmConfigBase::PermuteA,
GemmConfigBase::PermuteB>;
using GemmShape =
ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>,
GemmConfigBase::PermuteA,
GemmConfigBase::PermuteB>;
//static constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default;
// static constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default;
using TilePartitioner =
ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
GemmConfigBase::TileParitionerGroupNum,
GemmConfigBase::TileParitionerM01>;
ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
GemmConfigBase::TileParitionerGroupNum,
GemmConfigBase::TileParitionerM01>;
using GroupedConvTraitsType = ck_tile::GroupedConvTraits<NDimSpatial,
ConvSpec,
InLayout,
WeiLayout,
ck_tile::tuple<>, // = DsLayout
OutLayout,
VectorSizeA,
VectorSizeB,
VectorSizeC>;
ConvSpec,
InLayout,
WeiLayout,
ck_tile::tuple<>, // = DsLayout
OutLayout,
VectorSizeA,
VectorSizeB,
VectorSizeC>;
using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<
GemmConfigBase::kPadM,
GemmConfigBase::kPadN,
GemmConfigBase::kPadK,
DoubleSmemBuffer,
typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdData::AsLayout,
typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdData::BsLayout,
typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdData::CLayout,
GemmConfigBase::TransposeC,
GemmConfigBase::UseStructuredSparsity,
false, // Persistent,
GemmConfigBase::NumWaveGroups>;
GemmConfigBase::kPadM,
GemmConfigBase::kPadN,
GemmConfigBase::kPadK,
DoubleSmemBuffer,
typename GroupedConvTraitsType::template GroupedConvImplicitGemmTraitsBwdData<1>::AsLayout,
typename GroupedConvTraitsType::template GroupedConvImplicitGemmTraitsBwdData<1>::BsLayout,
typename GroupedConvTraitsType::template GroupedConvImplicitGemmTraitsBwdData<1>::CLayout,
GemmConfigBase::TransposeC,
GemmConfigBase::UseStructuredSparsity,
false, // Persistent,
GemmConfigBase::NumWaveGroups>;
using AccDataType = float;
using AccDataType = float;
using GemmPipelineProblem = ck_tile::GemmPipelineProblem<
OutDataType,
WeiDataType,
AccDataType,
GemmShape,
typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdData,
ck_tile::element_wise::PassThrough,
ck_tile::element_wise::PassThrough,
InDataType,
true,
VectorSizeA,
VectorSizeB>;
OutDataType,
WeiDataType,
AccDataType,
GemmShape,
typename GroupedConvTraitsType::template GroupedConvImplicitGemmTraitsBwdData<1>,
ck_tile::element_wise::PassThrough,
ck_tile::element_wise::PassThrough,
InDataType,
true,
VectorSizeA,
VectorSizeB>;
using BaseGemmPipeline = typename PipelineTypeTraits<
PipelineVersion>::template UniversalGemmPipeline<GemmPipelineProblem>;
using BaseGemmPipeline = typename PipelineTypeTraits<PipelineVersion>::template UniversalGemmPipeline<GemmPipelineProblem>;
template <bool HasHotLoop, ck_tile::TailNumber TailNumber, ck_tile::memory_operation_enum MemOp>
auto CreateKernel() const
{
constexpr auto scheduler = GemmConfigBase::Scheduler;
using UniversalGemmProblem =
ck_tile::UniversalGemmPipelineProblem<OutDataType,
WeiDataType,
AccDataType,
GemmShape,
GemmUniversalTraits,
scheduler,
HasHotLoop,
TailNumber,
ck_tile::element_wise::PassThrough,
ck_tile::element_wise::PassThrough,
InDataType,
true,
VectorSizeA,
VectorSizeB>;
WeiDataType,
AccDataType,
GemmShape,
GemmUniversalTraits,
scheduler,
// HasHotLoop,
// TailNumber,
ck_tile::element_wise::PassThrough,
ck_tile::element_wise::PassThrough,
InDataType,
true,
VectorSizeA,
VectorSizeB>;
using GemmPipeline = typename PipelineTypeTraits<PipelineVersion>::template GemmPipeline<UniversalGemmProblem>;
using GemmPipeline = typename PipelineTypeTraits<PipelineVersion>::template GemmPipeline<
UniversalGemmProblem>;
using CDEElementWise = ck_tile::element_wise::PassThrough;
using ConvEpilogue = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
OutDataType,
WeiDataType,
ck_tile::tuple<>, // = DsDataType
AccDataType,
InDataType,
typename GroupedConvTraitsType::ImplicitGemmDsLayout,
ck_tile::tensor_layout::gemm::RowMajor,
CDEElementWise,
TilePartitioner::MPerBlock,
TilePartitioner::NPerBlock,
M_Warp,
N_Warp,
M_Warp_Tile,
N_Warp_Tile,
K_Warp_Tile,
GemmConfigBase::TransposeC,
MemOp,
1,
true,
GroupedConvTraitsType::VectorSizeC>>;
using ConvEpilogue = ck_tile::CShuffleEpilogue<
ck_tile::CShuffleEpilogueProblem<OutDataType,
WeiDataType,
ck_tile::tuple<>, // = DsDataType
AccDataType,
InDataType,
typename GroupedConvTraitsType::ImplicitGemmDsLayout,
ck_tile::tensor_layout::gemm::RowMajor,
CDEElementWise,
TilePartitioner::MPerBlock,
TilePartitioner::NPerBlock,
M_Warp,
N_Warp,
M_Warp_Tile,
N_Warp_Tile,
K_Warp_Tile,
GemmConfigBase::TransposeC,
MemOp,
1,
true,
GroupedConvTraitsType::VectorSizeC>>;
return ck_tile::GroupedConvolutionBackwardDataKernel<GroupedConvTraitsType,
TilePartitioner,
GemmPipeline,
ConvEpilogue>{};
TilePartitioner,
GemmPipeline,
ConvEpilogue>{};
}
bool IsSupportedArgument(const ck_tile::GroupedConvBwdDataHostArgs& args) const override
{
if (args.k_batch > 1)
if(args.k_batch > 1)
{
using Kernel = decltype(CreateKernel<false, ck_tile::TailNumber::Empty, ck_tile::memory_operation_enum::atomic_add>());
return Kernel::IsSupportedArgument(args);
using Kernel = decltype(CreateKernel<false,
ck_tile::TailNumber::Empty,
ck_tile::memory_operation_enum::atomic_add>());
return Kernel::IsSupportedArgument(args);
}
using Kernel = decltype(CreateKernel<false, ck_tile::TailNumber::Empty, ck_tile::memory_operation_enum::set>());
using Kernel = decltype(CreateKernel<false,
ck_tile::TailNumber::Empty,
ck_tile::memory_operation_enum::set>());
return Kernel::IsSupportedArgument(args);
};
float Run(const ck_tile::GroupedConvBwdDataHostArgs& args, bool time_kernel, int n_warmup=5, int n_repeat=50) const override
float Run(const ck_tile::GroupedConvBwdDataHostArgs& args,
bool time_kernel,
int n_warmup = 5,
int n_repeat = 50) const override
{
[[maybe_unused]] ck_tile::index_t KGroups = 1;
for(int i=0; i<args.num_dim_spatial_;i++) {
KGroups *= args.filter_spatial_lengths_[i];//std::min(args.filter_spatial_lengths_[i], args.conv_filter_strides_[i]);
[[maybe_unused]] ck_tile::index_t KGroups = 1;
for(int i = 0; i < args.num_dim_spatial_; i++)
{
KGroups *= args.filter_spatial_lengths_[i]; // std::min(args.filter_spatial_lengths_[i],
// args.conv_filter_strides_[i]);
}
const index_t ConvStrideH = args.conv_filter_strides_[0];
const index_t ConvStrideW = args.conv_filter_strides_[1];
const index_t ConvDilationH = args.conv_filter_dilations_[0];
const index_t ConvDilationW = args.conv_filter_dilations_[1];
const auto GcdStrideDilationH = gcd(ConvStrideH, ConvDilationH);
const auto GcdStrideDilationW = gcd(ConvStrideW, ConvDilationW);
const auto YTilde = ConvStrideH / GcdStrideDilationH;
const auto XTilde = ConvStrideW / GcdStrideDilationW;
const auto Y = args.filter_spatial_lengths_[0];
const auto X = args.filter_spatial_lengths_[1];
[[maybe_unused]] const auto YDot = integer_divide_ceil(Y, YTilde);
[[maybe_unused]] const auto XDot = integer_divide_ceil(X, XTilde);
const index_t ConvStrideH = args.conv_filter_strides_[0];
const index_t ConvStrideW = args.conv_filter_strides_[1];
const index_t ConvDilationH = args.conv_filter_dilations_[0];
const index_t ConvDilationW = args.conv_filter_dilations_[1];
const auto GcdStrideDilationH = gcd(ConvStrideH, ConvDilationH);
const auto GcdStrideDilationW = gcd(ConvStrideW, ConvDilationW);
const auto YTilde = ConvStrideH / GcdStrideDilationH;
const auto XTilde = ConvStrideW / GcdStrideDilationW;
const auto Y = args.filter_spatial_lengths_[0];
const auto X = args.filter_spatial_lengths_[1];
[[maybe_unused]] const auto YDot = integer_divide_ceil(Y, YTilde);
[[maybe_unused]] const auto XDot = integer_divide_ceil(X, XTilde);
const ck_tile::index_t gemm_k = args.K_ * XDot * YDot;
@@ -229,29 +246,28 @@ struct GroupedConvolutionBackwardDataInvoker :
printf("gemm_k: %d num_loop: %d, Xdot: %ld YDot: %ld\n", gemm_k, num_loop, XDot, YDot);
const auto Run = [&](const auto has_hot_loop_,
const auto tail_number_,
const auto memory_operation_) {
constexpr bool has_hot_loop_v = has_hot_loop_.value;
constexpr auto tail_number_v = tail_number_.value;
constexpr auto memory_operation = memory_operation_.value;
const auto Run =
[&](const auto has_hot_loop_, const auto tail_number_, const auto memory_operation_) {
constexpr bool has_hot_loop_v = has_hot_loop_.value;
constexpr auto tail_number_v = tail_number_.value;
constexpr auto memory_operation = memory_operation_.value;
auto kernel = CreateKernel<has_hot_loop_v, tail_number_v, memory_operation>();
using Kernel = decltype(kernel);
auto kargs = Kernel::MakeKernelArgs(args);
const dim3 grids = Kernel::GridSize(args);
const dim3 blocks = Kernel::BlockSize();
auto kernel = CreateKernel<has_hot_loop_v, tail_number_v, memory_operation>();
using Kernel = decltype(kernel);
printf("grid: (%u, %u, %u)\n", grids.x, grids.y, grids.z);
ck_tile::stream_config s {nullptr, time_kernel, 1, n_warmup, n_repeat};
auto kargs = Kernel::MakeKernelArgs(args);
const dim3 grids = Kernel::GridSize(args);
const dim3 blocks = Kernel::BlockSize();
ave_time = ck_tile::launch_kernel(
s, ck_tile::make_kernel<kBlockPerCu>(kernel, grids, blocks, 0, kargs));
printf("grid: (%u, %u, %u)\n", grids.x, grids.y, grids.z);
return ave_time;
};
ck_tile::stream_config s{nullptr, time_kernel, 1, n_warmup, n_repeat};
ave_time = ck_tile::launch_kernel(
s, ck_tile::make_kernel<kBlockPerCu>(kernel, grids, blocks, 0, kargs));
return ave_time;
};
const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
if(args.k_batch == 1)
@@ -272,22 +288,28 @@ struct GroupedConvolutionBackwardDataInvoker :
{
std::stringstream min_occupancy;
min_occupancy << "_blk_per_cu_" << kBlockPerCu;
if (args.k_batch > 1)
if(args.k_batch > 1)
{
using Kernel = decltype(CreateKernel<false, ck_tile::TailNumber::Empty, ck_tile::memory_operation_enum::atomic_add>());
return Kernel::GetName() + min_occupancy.str();
using Kernel = decltype(CreateKernel<false,
ck_tile::TailNumber::Empty,
ck_tile::memory_operation_enum::atomic_add>());
return Kernel::GetName() + min_occupancy.str();
}
using Kernel = decltype(CreateKernel<false, ck_tile::TailNumber::Empty, ck_tile::memory_operation_enum::set>());
using Kernel = decltype(CreateKernel<false,
ck_tile::TailNumber::Empty,
ck_tile::memory_operation_enum::set>());
return Kernel::GetName() + min_occupancy.str();
};
GroupedConvolutionBackwardDataInvoker() = default;
GroupedConvolutionBackwardDataInvoker() = default;
GroupedConvolutionBackwardDataInvoker(const GroupedConvolutionBackwardDataInvoker&) = default;
GroupedConvolutionBackwardDataInvoker& operator=(const GroupedConvolutionBackwardDataInvoker&) = default;
GroupedConvolutionBackwardDataInvoker&
operator=(const GroupedConvolutionBackwardDataInvoker&) = default;
GroupedConvolutionBackwardDataInvoker(GroupedConvolutionBackwardDataInvoker&&) = default;
GroupedConvolutionBackwardDataInvoker& operator=(GroupedConvolutionBackwardDataInvoker&&) = default;
~GroupedConvolutionBackwardDataInvoker() override = default;
};
GroupedConvolutionBackwardDataInvoker&
operator=(GroupedConvolutionBackwardDataInvoker&&) = default;
~GroupedConvolutionBackwardDataInvoker() override = default;
};
}
}
} // namespace ops
} // namespace ck_tile

View File

@@ -6,4 +6,3 @@ include_directories(BEFORE
)
add_subdirectory(src)
add_subdirectory(ck_tile)