mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-07-01 04:07:56 +00:00
moved ck tile profiler to experimental
This commit is contained in:
@@ -41,6 +41,7 @@ include(CTest)
|
||||
option(ENABLE_CLANG_CPP_CHECKS "Enables clang tidy, cppcheck" ON)
|
||||
option(MIOPEN_REQ_LIBS_ONLY "Build only the MIOpen required libraries" OFF)
|
||||
option(CK_EXPERIMENTAL_BUILDER "Enable experimental builder" OFF)
|
||||
option(CK_EXPERIMENTAL_PROFILER "Enable experimental profiler" ON)
|
||||
option(BUILD_MHA_LIB "Build the static library for flash attention" OFF)
|
||||
option(FORCE_DISABLE_XDL "Skip compiling XDL specific instances (even if supported GPUs are included in GPU_TARGETS)" OFF)
|
||||
option(FORCE_DISABLE_WMMA "Skip compiling WMMA specific instances (even if supported GPUs are included in GPU_TARGETS)" OFF)
|
||||
@@ -50,6 +51,12 @@ if(CK_EXPERIMENTAL_BUILDER)
|
||||
include_directories(${PROJECT_SOURCE_DIR}/experimental/builder/include)
|
||||
endif()
|
||||
|
||||
if(CK_EXPERIMENTAL_PROFILER)
|
||||
add_definitions(-DCK_EXPERIMENTAL_PROFILEr)
|
||||
include_directories(${PROJECT_SOURCE_DIR}/experimental/ck_tile_profiler/include)
|
||||
endif()
|
||||
|
||||
|
||||
# Usage: for customized Python location cmake -DCK_USE_ALTERNATIVE_PYTHON="/opt/Python-3.8.13/bin/python3.8"
|
||||
# CK Codegen requires dataclass which is added in Python 3.7
|
||||
# Python version 3.8 is required for general good practice as it is default for Ubuntu 20.04
|
||||
@@ -729,6 +736,10 @@ if (CK_EXPERIMENTAL_BUILDER)
|
||||
add_subdirectory(experimental/builder)
|
||||
endif()
|
||||
|
||||
if (CK_EXPERIMENTAL_PROFILER)
|
||||
add_subdirectory(experimental/ck_tile_profiler)
|
||||
endif()
|
||||
|
||||
if(CK_USE_CODEGEN AND (SUPPORTED_GPU_TARGETS MATCHES "gfx9" OR GPU_ARCHS))
|
||||
add_subdirectory(codegen)
|
||||
endif()
|
||||
@@ -768,6 +779,13 @@ if(CK_EXPERIMENTAL_BUILDER)
|
||||
)
|
||||
endif()
|
||||
|
||||
if(CK_EXPERIMENTAL_PROFILER)
|
||||
rocm_install(DIRECTORY
|
||||
${PROJECT_SOURCE_DIR}/ck_tile_profiler
|
||||
DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/ck_tile
|
||||
)
|
||||
endif()
|
||||
|
||||
set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE")
|
||||
set(CPACK_RPM_PACKAGE_LICENSE "MIT")
|
||||
|
||||
|
||||
@@ -2,4 +2,4 @@ include_directories(BEFORE
|
||||
${CMAKE_CURRENT_LIST_DIR}/include
|
||||
)
|
||||
|
||||
add_subdirectory(src)
|
||||
add_subdirectory(src)
|
||||
@@ -52,12 +52,12 @@ template <ck_tile::index_t NDimSpatial,
|
||||
typename ComputeTypeA = InDataType,
|
||||
typename ComputeTypeB = ComputeTypeA>
|
||||
bool profile_grouped_conv_bwd_data_impl(int do_verification,
|
||||
int init_method,
|
||||
bool /*do_log*/,
|
||||
bool time_kernel,
|
||||
const ck_tile::conv::ConvParam& conv_param,
|
||||
const std::string& split_k,
|
||||
ck_tile::index_t instance_index = -1)
|
||||
int init_method,
|
||||
bool /*do_log*/,
|
||||
bool time_kernel,
|
||||
const ck_tile::conv::ConvParam& conv_param,
|
||||
const std::string& split_k,
|
||||
ck_tile::index_t instance_index = -1)
|
||||
{
|
||||
using AccDataType = float;
|
||||
using InElementOp = ck_tile::element_wise::PassThrough;
|
||||
@@ -93,24 +93,21 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
|
||||
ck_tile::FillUniformDistribution<WeiDataType>{-1.f, 1.f}(weight);
|
||||
ck_tile::FillUniformDistribution<OutDataType>{-1.f, 1.f}(output);
|
||||
break;
|
||||
default:
|
||||
weight.SetZero();
|
||||
output.SetZero();
|
||||
default: weight.SetZero(); output.SetZero();
|
||||
}
|
||||
|
||||
using DeviceOp = ops::GroupedConvolutionBackwardDataBaseInvoker<
|
||||
NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ComputeTypeA,
|
||||
ComputeTypeB>;
|
||||
using DeviceOp = ops::GroupedConvolutionBackwardDataBaseInvoker<NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ComputeTypeA,
|
||||
ComputeTypeB>;
|
||||
|
||||
// get device op instances
|
||||
const auto ops = ck_tile::ops::DeviceOperationInstanceFactory<DeviceOp>::GetInstances();
|
||||
@@ -123,14 +120,15 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
|
||||
float best_gb_per_sec = 0;
|
||||
std::string best_split_k("1");
|
||||
|
||||
// std::vector<ck_tile::index_t> split_k_list = {1, 2, 4, 6, 8, 10, 12, 16, 19, 32, 38, 64, 76, 128, 152, 256, 304};
|
||||
// std::vector<ck_tile::index_t> split_k_list = {1, 2, 4, 6, 8, 10, 12, 16, 19, 32, 38, 64, 76,
|
||||
// 128, 152, 256, 304};
|
||||
std::vector<ck_tile::index_t> split_k_list = {1, 2, 3, 4, 6, 8, 12, 16};
|
||||
if(split_k != "all")
|
||||
{
|
||||
try
|
||||
{
|
||||
ck_tile::index_t split_k_value = std::stoi(split_k);
|
||||
split_k_list = {split_k_value};
|
||||
split_k_list = {split_k_value};
|
||||
}
|
||||
catch(const std::exception& e)
|
||||
{
|
||||
@@ -142,21 +140,21 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
|
||||
// First, calculate the reference result if verification is needed.
|
||||
ck_tile::HostTensor<InDataType> input_host_ref(in_g_n_c_wis_desc);
|
||||
input_host_ref.SetZero();
|
||||
if (do_verification)
|
||||
if(do_verification)
|
||||
{
|
||||
ck_tile::reference_grouped_conv_bwd_data<NDimSpatial, InDataType, WeiDataType, OutDataType>(
|
||||
input_host_ref,
|
||||
weight,
|
||||
output,
|
||||
conv_param.conv_filter_strides_,
|
||||
conv_param.conv_filter_dilations_,
|
||||
conv_param.input_left_pads_,
|
||||
conv_param.input_right_pads_);
|
||||
input_host_ref,
|
||||
weight,
|
||||
output,
|
||||
conv_param.conv_filter_strides_,
|
||||
conv_param.conv_filter_dilations_,
|
||||
conv_param.input_left_pads_,
|
||||
conv_param.input_right_pads_);
|
||||
}
|
||||
|
||||
//instance_index = 0;
|
||||
// instance_index = 0;
|
||||
index_t num_kernel = 0;
|
||||
bool all_pass = true;
|
||||
bool all_pass = true;
|
||||
for(auto& op : ops)
|
||||
{
|
||||
for(std::size_t split_k_id = 0; split_k_id < split_k_list.size(); split_k_id++)
|
||||
@@ -173,13 +171,13 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
|
||||
output_dev_buf.ToDevice(output.data());
|
||||
|
||||
ck_tile::GroupedConvBwdDataHostArgs args(conv_param,
|
||||
input_dev_buf.GetDeviceBuffer(),
|
||||
weight_dev_buf.GetDeviceBuffer(),
|
||||
{},
|
||||
output_dev_buf.GetDeviceBuffer(),
|
||||
split_k_value);
|
||||
input_dev_buf.GetDeviceBuffer(),
|
||||
weight_dev_buf.GetDeviceBuffer(),
|
||||
{},
|
||||
output_dev_buf.GetDeviceBuffer(),
|
||||
split_k_value);
|
||||
|
||||
// Split-K autodeduction is not supported.
|
||||
// Split-K autodeduction is not supported.
|
||||
if(op->IsSupportedArgument(args) && split_k_value >= 1)
|
||||
{
|
||||
num_kernel++;
|
||||
@@ -190,10 +188,12 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
|
||||
}
|
||||
|
||||
std::string op_name = op->GetName(args);
|
||||
std::cout << op_name << ", SplitK " << split_k_param_str << " is profiled..." << std::endl;
|
||||
std::cout << op_name << ", SplitK " << split_k_param_str << " is profiled..."
|
||||
<< std::endl;
|
||||
|
||||
// Run verification first. If it doesn't pass, no need to do performance measurement.
|
||||
bool pass = false;
|
||||
// Run verification first. If it doesn't pass, no need to do performance
|
||||
// measurement.
|
||||
bool pass = false;
|
||||
if(do_verification)
|
||||
{
|
||||
constexpr int n_warmup = 0;
|
||||
@@ -202,8 +202,9 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
|
||||
op->Run(args, false, n_warmup, n_repeat);
|
||||
input_dev_buf.FromDevice(input.data());
|
||||
|
||||
const ck_tile::index_t GemmK = conv_param.K_
|
||||
* conv_param.filter_spatial_lengths_[0] * conv_param.filter_spatial_lengths_[1];
|
||||
const ck_tile::index_t GemmK = conv_param.K_ *
|
||||
conv_param.filter_spatial_lengths_[0] *
|
||||
conv_param.filter_spatial_lengths_[1];
|
||||
|
||||
const float max_accumulated_value =
|
||||
*std::max_element(input_host_ref.mData.begin(), input_host_ref.mData.end());
|
||||
@@ -212,37 +213,39 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
|
||||
GemmK, split_k_value, max_accumulated_value);
|
||||
|
||||
pass = ck_tile::check_err(input,
|
||||
input_host_ref,
|
||||
"Error: Incorrect results!",
|
||||
rtol_atol.at(ck_tile::number<0>{}),
|
||||
rtol_atol.at(ck_tile::number<1>{}));
|
||||
input_host_ref,
|
||||
"Error: Incorrect results!",
|
||||
rtol_atol.at(ck_tile::number<0>{}),
|
||||
rtol_atol.at(ck_tile::number<1>{}));
|
||||
|
||||
std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
|
||||
<< " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
|
||||
<< std::endl;
|
||||
std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
|
||||
<< " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
|
||||
<< std::endl;
|
||||
std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail")
|
||||
<< std::endl;
|
||||
|
||||
all_pass &= pass;
|
||||
}
|
||||
|
||||
bool is_valid = do_verification ? pass : true;
|
||||
|
||||
if (is_valid)
|
||||
if(is_valid)
|
||||
{
|
||||
constexpr int n_warmup = 5;
|
||||
constexpr int n_repeat = 50;
|
||||
float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat);
|
||||
float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat);
|
||||
|
||||
std::size_t flop = conv_param.GetFlops();
|
||||
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
|
||||
std::size_t flop = conv_param.GetFlops();
|
||||
std::size_t num_btype =
|
||||
conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_btype / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops
|
||||
<< " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK "
|
||||
<< split_k_param_str << std::endl;
|
||||
|
||||
<< " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK "
|
||||
<< split_k_param_str << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_name = op_name;
|
||||
@@ -253,7 +256,7 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
else
|
||||
{
|
||||
std::cout << op->GetName(args) << ", SplitK " << split_k_param_str
|
||||
<< " does not support this problem." << std::endl;
|
||||
@@ -262,14 +265,11 @@ bool profile_grouped_conv_bwd_data_impl(int do_verification,
|
||||
}
|
||||
|
||||
std::stringstream ss;
|
||||
ss << "\n********************************"
|
||||
<< "\nCK Tile best configuration parameters:"
|
||||
<< "\n********************************"
|
||||
<< "\nname: " << best_op_name
|
||||
<< "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops
|
||||
<< "\nGB/s: " << best_gb_per_sec
|
||||
<< "\nSplitK: " << best_split_k
|
||||
<< std::endl;
|
||||
ss << "\n********************************"
|
||||
<< "\nCK Tile best configuration parameters:" << "\n********************************"
|
||||
<< "\nname: " << best_op_name << "\navg_time: " << best_avg_time
|
||||
<< "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec
|
||||
<< "\nSplitK: " << best_split_k << std::endl;
|
||||
|
||||
std::cout << ss.str();
|
||||
|
||||
@@ -93,24 +93,21 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
|
||||
ck_tile::FillUniformDistribution<InDataType>{0.f, 1.f}(input);
|
||||
ck_tile::FillUniformDistribution<OutDataType>{0.f, 1.f}(output);
|
||||
break;
|
||||
default:
|
||||
input.SetZero();
|
||||
output.SetZero();
|
||||
default: input.SetZero(); output.SetZero();
|
||||
}
|
||||
|
||||
using DeviceOp = ops::GroupedConvolutionBackwardWeightBaseInvoker<
|
||||
NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ComputeTypeA,
|
||||
ComputeTypeB>;
|
||||
using DeviceOp = ops::GroupedConvolutionBackwardWeightBaseInvoker<NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ComputeTypeA,
|
||||
ComputeTypeB>;
|
||||
|
||||
// get device op instances
|
||||
const auto ops = ck_tile::ops::DeviceOperationInstanceFactory<DeviceOp>::GetInstances();
|
||||
@@ -123,13 +120,14 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
|
||||
float best_gb_per_sec = 0;
|
||||
std::string best_split_k("1");
|
||||
|
||||
std::vector<ck_tile::index_t> split_k_list = {1, 2, 4, 6, 8, 10, 12, 16, 19, 32, 38, 64, 76, 128, 152, 256, 304};
|
||||
std::vector<ck_tile::index_t> split_k_list = {
|
||||
1, 2, 4, 6, 8, 10, 12, 16, 19, 32, 38, 64, 76, 128, 152, 256, 304};
|
||||
if(split_k != "all")
|
||||
{
|
||||
try
|
||||
{
|
||||
ck_tile::index_t split_k_value = std::stoi(split_k);
|
||||
split_k_list = {split_k_value};
|
||||
split_k_list = {split_k_value};
|
||||
}
|
||||
catch(const std::exception& e)
|
||||
{
|
||||
@@ -141,21 +139,21 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
|
||||
// First, calculate the reference result if verification is needed.
|
||||
ck_tile::HostTensor<WeiDataType> weight_host_ref(wei_g_k_c_xs_desc);
|
||||
weight_host_ref.SetZero();
|
||||
if (do_verification)
|
||||
if(do_verification)
|
||||
{
|
||||
ck_tile::reference_grouped_conv_bwd_weight<NDimSpatial, InDataType, WeiDataType, OutDataType>(
|
||||
input,
|
||||
weight_host_ref,
|
||||
output,
|
||||
conv_param.conv_filter_strides_,
|
||||
conv_param.conv_filter_dilations_,
|
||||
conv_param.input_left_pads_,
|
||||
conv_param.input_right_pads_);
|
||||
ck_tile::
|
||||
reference_grouped_conv_bwd_weight<NDimSpatial, InDataType, WeiDataType, OutDataType>(
|
||||
input,
|
||||
weight_host_ref,
|
||||
output,
|
||||
conv_param.conv_filter_strides_,
|
||||
conv_param.conv_filter_dilations_,
|
||||
conv_param.input_left_pads_,
|
||||
conv_param.input_right_pads_);
|
||||
}
|
||||
|
||||
|
||||
index_t num_kernel = 0;
|
||||
bool all_pass = true;
|
||||
bool all_pass = true;
|
||||
for(auto& op : ops)
|
||||
{
|
||||
for(std::size_t split_k_id = 0; split_k_id < split_k_list.size(); split_k_id++)
|
||||
@@ -172,13 +170,13 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
|
||||
output_dev_buf.ToDevice(output.data());
|
||||
|
||||
ck_tile::GroupedConvBwdWeightHostArgs args(conv_param,
|
||||
input_dev_buf.GetDeviceBuffer(),
|
||||
weight_dev_buf.GetDeviceBuffer(),
|
||||
{},
|
||||
output_dev_buf.GetDeviceBuffer(),
|
||||
split_k_value);
|
||||
input_dev_buf.GetDeviceBuffer(),
|
||||
weight_dev_buf.GetDeviceBuffer(),
|
||||
{},
|
||||
output_dev_buf.GetDeviceBuffer(),
|
||||
split_k_value);
|
||||
|
||||
// Split-K autodeduction is not supported.
|
||||
// Split-K autodeduction is not supported.
|
||||
if(op->IsSupportedArgument(args) && split_k_value >= 1)
|
||||
{
|
||||
num_kernel++;
|
||||
@@ -189,10 +187,12 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
|
||||
}
|
||||
|
||||
std::string op_name = op->GetName(args);
|
||||
std::cout << op_name << ", SplitK " << split_k_param_str << " is profiled..." << std::endl;
|
||||
std::cout << op_name << ", SplitK " << split_k_param_str << " is profiled..."
|
||||
<< std::endl;
|
||||
|
||||
// Run verification first. If it doesn't pass, no need to do performance measurement.
|
||||
bool pass = false;
|
||||
// Run verification first. If it doesn't pass, no need to do performance
|
||||
// measurement.
|
||||
bool pass = false;
|
||||
if(do_verification)
|
||||
{
|
||||
constexpr int n_warmup = 0;
|
||||
@@ -201,45 +201,48 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
|
||||
op->Run(args, false, n_warmup, n_repeat);
|
||||
weight_dev_buf.FromDevice(weight.data());
|
||||
|
||||
const ck_tile::index_t GemmK = weight.get_element_size() / (conv_param.G_ * conv_param.K_);
|
||||
const float max_accumulated_value =
|
||||
*std::max_element(weight_host_ref.mData.begin(), weight_host_ref.mData.end());
|
||||
const ck_tile::index_t GemmK =
|
||||
weight.get_element_size() / (conv_param.G_ * conv_param.K_);
|
||||
const float max_accumulated_value = *std::max_element(
|
||||
weight_host_ref.mData.begin(), weight_host_ref.mData.end());
|
||||
const auto rtol_atol =
|
||||
calculate_rtol_atol<InDataType, WeiDataType, AccDataType, OutDataType>(
|
||||
GemmK, split_k_value, max_accumulated_value);
|
||||
|
||||
pass = ck_tile::check_err(weight,
|
||||
weight_host_ref,
|
||||
"Error: Incorrect results!",
|
||||
rtol_atol.at(ck_tile::number<0>{}),
|
||||
rtol_atol.at(ck_tile::number<1>{}));
|
||||
weight_host_ref,
|
||||
"Error: Incorrect results!",
|
||||
rtol_atol.at(ck_tile::number<0>{}),
|
||||
rtol_atol.at(ck_tile::number<1>{}));
|
||||
|
||||
std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
|
||||
<< " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
|
||||
<< std::endl;
|
||||
std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
|
||||
<< " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
|
||||
<< std::endl;
|
||||
std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail")
|
||||
<< std::endl;
|
||||
|
||||
all_pass &= pass;
|
||||
}
|
||||
|
||||
bool is_valid = do_verification ? pass : true;
|
||||
|
||||
if (is_valid)
|
||||
if(is_valid)
|
||||
{
|
||||
constexpr int n_warmup = 5;
|
||||
constexpr int n_repeat = 50;
|
||||
float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat);
|
||||
float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat);
|
||||
|
||||
std::size_t flop = conv_param.GetFlops();
|
||||
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
|
||||
std::size_t flop = conv_param.GetFlops();
|
||||
std::size_t num_btype =
|
||||
conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
|
||||
|
||||
float tflops = static_cast<float>(flop) / 1.E9 / avg_time;
|
||||
float gb_per_sec = num_btype / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops
|
||||
<< " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK "
|
||||
<< split_k_param_str << std::endl;
|
||||
|
||||
<< " TFlops, " << gb_per_sec << " GB/s, " << op_name << ", SplitK "
|
||||
<< split_k_param_str << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
best_op_name = op_name;
|
||||
@@ -250,7 +253,7 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
else
|
||||
{
|
||||
std::cout << op->GetName(args) << ", SplitK " << split_k_param_str
|
||||
<< " does not support this problem." << std::endl;
|
||||
@@ -259,14 +262,11 @@ bool profile_grouped_conv_bwd_weight_impl(int do_verification,
|
||||
}
|
||||
|
||||
std::stringstream ss;
|
||||
ss << "\n********************************"
|
||||
<< "\nCK Tile best configuration parameters:"
|
||||
<< "\n********************************"
|
||||
<< "\nname: " << best_op_name
|
||||
<< "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops
|
||||
<< "\nGB/s: " << best_gb_per_sec
|
||||
<< "\nSplitK: " << best_split_k
|
||||
<< std::endl;
|
||||
ss << "\n********************************"
|
||||
<< "\nCK Tile best configuration parameters:" << "\n********************************"
|
||||
<< "\nname: " << best_op_name << "\navg_time: " << best_avg_time
|
||||
<< "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec
|
||||
<< "\nSplitK: " << best_split_k << std::endl;
|
||||
|
||||
std::cout << ss.str();
|
||||
|
||||
@@ -52,12 +52,12 @@ template <ck_tile::index_t NDimSpatial,
|
||||
typename ComputeTypeA = InDataType,
|
||||
typename ComputeTypeB = ComputeTypeA>
|
||||
bool profile_grouped_conv_fwd_impl(int do_verification,
|
||||
int init_method,
|
||||
bool /*do_log*/,
|
||||
bool time_kernel,
|
||||
const ck_tile::conv::ConvParam& conv_param,
|
||||
const ck_tile::index_t k_batch,
|
||||
ck_tile::index_t instance_index = -1)
|
||||
int init_method,
|
||||
bool /*do_log*/,
|
||||
bool time_kernel,
|
||||
const ck_tile::conv::ConvParam& conv_param,
|
||||
const ck_tile::index_t k_batch,
|
||||
ck_tile::index_t instance_index = -1)
|
||||
{
|
||||
using AccDataType = float;
|
||||
using InElementOp = ck_tile::element_wise::PassThrough;
|
||||
@@ -100,19 +100,18 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
|
||||
weight.SetZero();
|
||||
}
|
||||
|
||||
using DeviceOp = ops::GroupedConvolutionForwardBaseInvoker<
|
||||
NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ComputeTypeA,
|
||||
ComputeTypeB>;
|
||||
using DeviceOp = ops::GroupedConvolutionForwardBaseInvoker<NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ComputeTypeA,
|
||||
ComputeTypeB>;
|
||||
|
||||
// get device op instances
|
||||
const auto ops = ck_tile::ops::DeviceOperationInstanceFactory<DeviceOp>::GetInstances();
|
||||
@@ -125,7 +124,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
|
||||
float best_gb_per_sec = 0;
|
||||
|
||||
index_t num_kernel = 0;
|
||||
bool all_pass = true;
|
||||
bool all_pass = true;
|
||||
|
||||
// tmp enforce instance
|
||||
// instance_index = -1;
|
||||
@@ -141,12 +140,12 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
|
||||
output_dev_buf.SetZero();
|
||||
|
||||
ck_tile::GroupedConvFwdHostArgs args(conv_param,
|
||||
input_dev_buf.GetDeviceBuffer(),
|
||||
weight_dev_buf.GetDeviceBuffer(),
|
||||
{},
|
||||
output_dev_buf.GetDeviceBuffer(),
|
||||
k_batch);
|
||||
|
||||
input_dev_buf.GetDeviceBuffer(),
|
||||
weight_dev_buf.GetDeviceBuffer(),
|
||||
{},
|
||||
output_dev_buf.GetDeviceBuffer(),
|
||||
k_batch);
|
||||
|
||||
if(op->IsSupportedArgument(args))
|
||||
{
|
||||
num_kernel++;
|
||||
@@ -160,7 +159,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
|
||||
std::cout << op_name << " is profiled..." << std::endl;
|
||||
|
||||
// Run verification first. If it doesn't pass, no need to do performance measurement.
|
||||
bool pass = false;
|
||||
bool pass = false;
|
||||
if(do_verification)
|
||||
{
|
||||
constexpr int n_warmup = 0;
|
||||
@@ -172,39 +171,42 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
|
||||
ck_tile::HostTensor<OutDataType> output_host_ref(out_g_n_k_wos_desc);
|
||||
output_host_ref.SetZero();
|
||||
|
||||
ck_tile::reference_grouped_conv_fwd<NDimSpatial, InDataType, WeiDataType, OutDataType>(
|
||||
input,
|
||||
weight,
|
||||
output_host_ref,
|
||||
conv_param.conv_filter_strides_,
|
||||
conv_param.conv_filter_dilations_,
|
||||
conv_param.input_left_pads_,
|
||||
conv_param.input_right_pads_);
|
||||
const ck_tile::index_t GemmK = weight.get_element_size() / (conv_param.G_ * conv_param.K_);
|
||||
ck_tile::
|
||||
reference_grouped_conv_fwd<NDimSpatial, InDataType, WeiDataType, OutDataType>(
|
||||
input,
|
||||
weight,
|
||||
output_host_ref,
|
||||
conv_param.conv_filter_strides_,
|
||||
conv_param.conv_filter_dilations_,
|
||||
conv_param.input_left_pads_,
|
||||
conv_param.input_right_pads_);
|
||||
const ck_tile::index_t GemmK =
|
||||
weight.get_element_size() / (conv_param.G_ * conv_param.K_);
|
||||
const float max_accumulated_value =
|
||||
*std::max_element(output_host_ref.mData.begin(), output_host_ref.mData.end());
|
||||
const auto rtol_atol =
|
||||
calculate_rtol_atol<InDataType, WeiDataType, AccDataType, OutDataType>(
|
||||
GemmK, k_batch, max_accumulated_value);
|
||||
pass = ck_tile::check_err(output,
|
||||
output_host_ref,
|
||||
"Error: Incorrect results!",
|
||||
rtol_atol.at(ck_tile::number<0>{}),
|
||||
rtol_atol.at(ck_tile::number<1>{}));
|
||||
output_host_ref,
|
||||
"Error: Incorrect results!",
|
||||
rtol_atol.at(ck_tile::number<0>{}),
|
||||
rtol_atol.at(ck_tile::number<1>{}));
|
||||
|
||||
std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
|
||||
<< " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
|
||||
<< std::endl;
|
||||
std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
|
||||
<< " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
|
||||
<< std::endl;
|
||||
std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail")
|
||||
<< std::endl;
|
||||
all_pass &= pass;
|
||||
}
|
||||
|
||||
bool is_valid = do_verification ? pass : true;
|
||||
if (is_valid)
|
||||
if(is_valid)
|
||||
{
|
||||
constexpr int n_warmup = 5;
|
||||
constexpr int n_repeat = 50;
|
||||
float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat);
|
||||
float avg_time = op->Run(args, time_kernel, n_warmup, n_repeat);
|
||||
|
||||
std::size_t flop = conv_param.GetFlops();
|
||||
std::size_t num_btype = conv_param.GetByte<InDataType, WeiDataType, OutDataType>();
|
||||
@@ -213,7 +215,7 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
|
||||
float gb_per_sec = num_btype / 1.E6 / avg_time;
|
||||
|
||||
std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops
|
||||
<< " TFlops, " << gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
<< " TFlops, " << gb_per_sec << " GB/s, " << op_name << std::endl;
|
||||
|
||||
if(tflops > best_tflops)
|
||||
{
|
||||
@@ -224,18 +226,16 @@ bool profile_grouped_conv_fwd_impl(int do_verification,
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
else
|
||||
{
|
||||
//std::cout << op->GetName(args) << " does not support this problem." << std::endl;
|
||||
// std::cout << op->GetName(args) << " does not support this problem." << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
std::cout << "\n********************************"
|
||||
<< "\nBest configuration parameters:"
|
||||
<< "\n********************************"
|
||||
<< "\nname: " << best_op_name
|
||||
<< "\navg_time: " << best_avg_time << "\ntflops: " << best_tflops
|
||||
<< "\nGB/s: " << best_gb_per_sec << std::endl;
|
||||
std::cout << "\n********************************"
|
||||
<< "\nBest configuration parameters:" << "\n********************************"
|
||||
<< "\nname: " << best_op_name << "\navg_time: " << best_avg_time
|
||||
<< "\ntflops: " << best_tflops << "\nGB/s: " << best_gb_per_sec << std::endl;
|
||||
|
||||
const char* log_file = std::getenv("CK_TILE_PROFILER_LOG_FILE");
|
||||
if(log_file != nullptr)
|
||||
@@ -25,9 +25,9 @@ enum struct ConvLayout
|
||||
|
||||
enum struct ConvDataType
|
||||
{
|
||||
F32_F32_F32, // 0
|
||||
F16_F16_F16, // 1
|
||||
BF16_BF16_BF16, // 2
|
||||
F32_F32_F32, // 0
|
||||
F16_F16_F16, // 1
|
||||
BF16_BF16_BF16, // 2
|
||||
};
|
||||
|
||||
#define OP_NAME "grouped_conv_bwd_data"
|
||||
@@ -38,14 +38,14 @@ static void print_helper_msg()
|
||||
std::string conv_param_parser_helper_msg;
|
||||
|
||||
conv_param_parser_helper_msg += "Following arguments (depending on number of spatial dims):\n"
|
||||
" Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n"
|
||||
" G, N, K, C, \n"
|
||||
" <filter spatial dimensions>, (ie Y, X for 2D)\n"
|
||||
" <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
|
||||
" <strides>, (ie Sy, Sx for 2D)\n"
|
||||
" <dilations>, (ie Dy, Dx for 2D)\n"
|
||||
" <left padding>, (ie LeftPy, LeftPx for 2D)\n"
|
||||
" <right padding>, (ie RightPy, RightPx for 2D)\n";
|
||||
" Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n"
|
||||
" G, N, K, C, \n"
|
||||
" <filter spatial dimensions>, (ie Y, X for 2D)\n"
|
||||
" <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
|
||||
" <strides>, (ie Sy, Sx for 2D)\n"
|
||||
" <dilations>, (ie Dy, Dx for 2D)\n"
|
||||
" <left padding>, (ie LeftPy, LeftPx for 2D)\n"
|
||||
" <right padding>, (ie RightPy, RightPx for 2D)\n";
|
||||
|
||||
std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
|
||||
<< "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
|
||||
@@ -111,17 +111,17 @@ int tile_profile_grouped_conv_bwd_data(int argc, char* argv[])
|
||||
// using F8 = ck_tile::fp8_t;
|
||||
// using BF8 = ck_tile::bf8_t;
|
||||
|
||||
using NHWGC = ck_tile::tensor_layout::convolution::NHWGC;
|
||||
using NHWGC = ck_tile::tensor_layout::convolution::NHWGC;
|
||||
// using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC;
|
||||
|
||||
using GKYXC = ck_tile::tensor_layout::convolution::GKYXC;
|
||||
using GKYXC = ck_tile::tensor_layout::convolution::GKYXC;
|
||||
// using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC;
|
||||
|
||||
using NHWGK = ck_tile::tensor_layout::convolution::NHWGK;
|
||||
using NHWGK = ck_tile::tensor_layout::convolution::NHWGK;
|
||||
// using NDHWGK = ck_tile::tensor_layout::convolution::NDHWGK;
|
||||
|
||||
constexpr auto I2 = ck_tile::number<2>{};
|
||||
//constexpr auto I3 = ck_tile::number<3>{};
|
||||
// constexpr auto I3 = ck_tile::number<3>{};
|
||||
|
||||
auto profile = [&](auto num_dim_spatial_tmp,
|
||||
auto in_layout,
|
||||
@@ -146,14 +146,14 @@ int tile_profile_grouped_conv_bwd_data(int argc, char* argv[])
|
||||
using ComputeTypeB = decltype(compute_type_b);
|
||||
|
||||
bool pass = ck_tile::profiler::profile_grouped_conv_bwd_data_impl<NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
ComputeTypeA,
|
||||
ComputeTypeB>(
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
ComputeTypeA,
|
||||
ComputeTypeB>(
|
||||
do_verification, init_method, do_log, time_kernel, params, split_k);
|
||||
|
||||
return pass ? 0 : 1;
|
||||
@@ -179,7 +179,7 @@ int tile_profile_grouped_conv_bwd_data(int argc, char* argv[])
|
||||
return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
|
||||
// {
|
||||
// if(data_type == ConvDataType::F32_F32_F32)
|
||||
@@ -193,7 +193,8 @@ int tile_profile_grouped_conv_bwd_data(int argc, char* argv[])
|
||||
// if(data_type == ConvDataType::BF16_F32_BF16)
|
||||
// {
|
||||
// // fp32 atomic add is used for weight tensor in bf16 kernel
|
||||
// return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, F32{}, BF16{}, BF16{}, BF16{});
|
||||
// return profile(I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, BF16{}, F32{}, BF16{}, BF16{},
|
||||
// BF16{});
|
||||
// }
|
||||
// if(data_type == ConvDataType::BF16_BF16_BF16)
|
||||
// {
|
||||
@@ -207,7 +208,8 @@ int tile_profile_grouped_conv_bwd_data(int argc, char* argv[])
|
||||
// else if(data_type == ConvDataType::I8_I8_I8)
|
||||
// {
|
||||
// return profile(
|
||||
// I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, int8_t{}, int8_t{}, int8_t{}, int8_t{}, int8_t{});
|
||||
// I3, NDHWGC{}, GKZYXC{}, NDHWGK{}, int8_t{}, int8_t{}, int8_t{}, int8_t{},
|
||||
// int8_t{});
|
||||
// }
|
||||
// }
|
||||
|
||||
@@ -43,14 +43,14 @@ static void print_helper_msg()
|
||||
std::string conv_param_parser_helper_msg;
|
||||
|
||||
conv_param_parser_helper_msg += "Following arguments (depending on number of spatial dims):\n"
|
||||
" Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n"
|
||||
" G, N, K, C, \n"
|
||||
" <filter spatial dimensions>, (ie Y, X for 2D)\n"
|
||||
" <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
|
||||
" <strides>, (ie Sy, Sx for 2D)\n"
|
||||
" <dilations>, (ie Dy, Dx for 2D)\n"
|
||||
" <left padding>, (ie LeftPy, LeftPx for 2D)\n"
|
||||
" <right padding>, (ie RightPy, RightPx for 2D)\n";
|
||||
" Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n"
|
||||
" G, N, K, C, \n"
|
||||
" <filter spatial dimensions>, (ie Y, X for 2D)\n"
|
||||
" <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
|
||||
" <strides>, (ie Sy, Sx for 2D)\n"
|
||||
" <dilations>, (ie Dy, Dx for 2D)\n"
|
||||
" <left padding>, (ie LeftPy, LeftPx for 2D)\n"
|
||||
" <right padding>, (ie RightPy, RightPx for 2D)\n";
|
||||
|
||||
std::cout << "arg1: tensor operation (" OP_NAME ": " OP_DESC ")\n"
|
||||
<< "arg2: data type (0: Input fp32, Weight fp32, Output fp32\n"
|
||||
@@ -151,14 +151,14 @@ int tile_profile_grouped_conv_bwd_weight(int argc, char* argv[])
|
||||
using ComputeTypeB = decltype(compute_type_b);
|
||||
|
||||
bool pass = ck_tile::profiler::profile_grouped_conv_bwd_weight_impl<NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
ComputeTypeA,
|
||||
ComputeTypeB>(
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
ComputeTypeA,
|
||||
ComputeTypeB>(
|
||||
do_verification, init_method, do_log, time_kernel, params, split_k);
|
||||
|
||||
return pass ? 0 : 1;
|
||||
@@ -184,7 +184,7 @@ int tile_profile_grouped_conv_bwd_weight(int argc, char* argv[])
|
||||
return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
|
||||
{
|
||||
if(data_type == ConvDataType::F32_F32_F32)
|
||||
@@ -25,10 +25,10 @@ enum struct ConvLayout
|
||||
|
||||
enum struct ConvDataType
|
||||
{
|
||||
F32_F32_F32, // 0
|
||||
F16_F16_F16, // 1
|
||||
BF16_BF16_BF16, // 2
|
||||
I8_I8_I8, // 3
|
||||
F32_F32_F32, // 0
|
||||
F16_F16_F16, // 1
|
||||
BF16_BF16_BF16, // 2
|
||||
I8_I8_I8, // 3
|
||||
};
|
||||
|
||||
#define OP_NAME "grouped_conv_fwd"
|
||||
@@ -39,14 +39,14 @@ static void print_helper_msg()
|
||||
std::string conv_param_parser_helper_msg;
|
||||
|
||||
conv_param_parser_helper_msg += "Following arguments (depending on number of spatial dims):\n"
|
||||
" Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n"
|
||||
" G, N, K, C, \n"
|
||||
" <filter spatial dimensions>, (ie Y, X for 2D)\n"
|
||||
" <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
|
||||
" <strides>, (ie Sy, Sx for 2D)\n"
|
||||
" <dilations>, (ie Dy, Dx for 2D)\n"
|
||||
" <left padding>, (ie LeftPy, LeftPx for 2D)\n"
|
||||
" <right padding>, (ie RightPy, RightPx for 2D)\n";
|
||||
" Number of spatial dimensions (1=Conv1d, 2=Conv2d, 3=Conv3d)\n"
|
||||
" G, N, K, C, \n"
|
||||
" <filter spatial dimensions>, (ie Y, X for 2D)\n"
|
||||
" <input image spatial dimensions>, (ie Hi, Wi for 2D)\n"
|
||||
" <strides>, (ie Sy, Sx for 2D)\n"
|
||||
" <dilations>, (ie Dy, Dx for 2D)\n"
|
||||
" <left padding>, (ie LeftPy, LeftPx for 2D)\n"
|
||||
" <right padding>, (ie RightPy, RightPx for 2D)\n";
|
||||
|
||||
std::cout
|
||||
// clang-format off
|
||||
@@ -101,7 +101,7 @@ int tile_profile_grouped_conv_fwd(int argc, char* argv[])
|
||||
return 1;
|
||||
}
|
||||
|
||||
const auto params = ck_tile::conv::parse_conv_param(num_dim_spatial, 9, argv);
|
||||
const auto params = ck_tile::conv::parse_conv_param(num_dim_spatial, 9, argv);
|
||||
constexpr ck_tile::index_t k_batch = 1;
|
||||
|
||||
using F32 = float;
|
||||
@@ -143,14 +143,14 @@ int tile_profile_grouped_conv_fwd(int argc, char* argv[])
|
||||
using ComputeTypeB = decltype(compute_type_b);
|
||||
|
||||
bool pass = ck_tile::profiler::profile_grouped_conv_fwd_impl<NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
ComputeTypeA,
|
||||
ComputeTypeB>(
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
ComputeTypeA,
|
||||
ComputeTypeB>(
|
||||
do_verification, init_method, do_log, time_kernel, params, k_batch);
|
||||
|
||||
return pass ? 0 : 1;
|
||||
@@ -171,7 +171,7 @@ int tile_profile_grouped_conv_fwd(int argc, char* argv[])
|
||||
return profile(I2, NHWGC{}, GKYXC{}, NHWGK{}, BF16{}, BF16{}, BF16{}, BF16{}, BF16{});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if(num_dim_spatial == 3 && layout == ConvLayout::NHWGC_GKYXC_NHWGK)
|
||||
{
|
||||
if(data_type == ConvDataType::F32_F32_F32)
|
||||
@@ -5,6 +5,8 @@
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/convolution_parameter.hpp"
|
||||
#include "ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp"
|
||||
#include "ck_tile/ops/grouped_convolution/utils/convolution_specialization.hpp"
|
||||
#include "ck_tile/ops/elementwise/unary_element_wise_operation.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
|
||||
@@ -3,3 +3,4 @@
|
||||
|
||||
add_subdirectory(src/tensor_operation_instance/gpu)
|
||||
add_subdirectory(src/utility)
|
||||
add_subdirectory(src/ck_tile/tensor_operation_instance/gpu)
|
||||
|
||||
@@ -0,0 +1,90 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <variant>
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/host/kernel_launch.hpp"
|
||||
#include "ck_tile/ops/epilogue.hpp"
|
||||
#include "ck_tile/ops/gemm.hpp"
|
||||
|
||||
#define CK_TILE_PIPELINE_COMPUTE_V3 1
|
||||
#define CK_TILE_PIPELINE_MEMORY 2
|
||||
#define CK_TILE_PIPELINE_COMPUTE_V4 3
|
||||
#define CK_TILE_PIPELINE_COMPUTE_V5 4
|
||||
|
||||
namespace ck_tile {
|
||||
namespace ops {
|
||||
|
||||
using MemoryOpSet =
|
||||
std::integral_constant<ck_tile::memory_operation_enum, ck_tile::memory_operation_enum::set>;
|
||||
|
||||
using MemoryOpAtomicAdd = std::integral_constant<ck_tile::memory_operation_enum,
|
||||
ck_tile::memory_operation_enum::atomic_add>;
|
||||
|
||||
struct GemmConfigBase
|
||||
{
|
||||
static constexpr bool kPadM = true;
|
||||
static constexpr bool kPadN = true;
|
||||
static constexpr bool kPadK = true;
|
||||
|
||||
static constexpr bool PermuteA = false;
|
||||
static constexpr bool PermuteB = false;
|
||||
|
||||
static constexpr bool TransposeC = false;
|
||||
static constexpr bool UseStructuredSparsity = false;
|
||||
|
||||
static constexpr int kBlockPerCu = 1;
|
||||
static constexpr ck_tile::index_t TileParitionerGroupNum = 8;
|
||||
static constexpr ck_tile::index_t TileParitionerM01 = 4;
|
||||
static constexpr auto Scheduler = ck_tile::GemmPipelineScheduler::Intrawave;
|
||||
static constexpr ck_tile::index_t Pipeline = CK_TILE_PIPELINE_COMPUTE_V3;
|
||||
static constexpr ck_tile::index_t NumWaveGroups = 1;
|
||||
static constexpr bool Preshuffle = false;
|
||||
static constexpr bool TiledMMAPermuteN = false;
|
||||
};
|
||||
|
||||
template <ck_tile::index_t PipelineId>
|
||||
struct PipelineTypeTraits;
|
||||
|
||||
template <>
|
||||
struct PipelineTypeTraits<CK_TILE_PIPELINE_MEMORY>
|
||||
{
|
||||
template <typename PipelineProblem>
|
||||
using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem<PipelineProblem>;
|
||||
template <typename PipelineProblem>
|
||||
using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem<PipelineProblem>;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V3>
|
||||
{
|
||||
template <typename PipelineProblem>
|
||||
using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV3<PipelineProblem>;
|
||||
template <typename PipelineProblem>
|
||||
using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV3<PipelineProblem>;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V4>
|
||||
{
|
||||
template <typename PipelineProblem>
|
||||
using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV4<PipelineProblem>;
|
||||
template <typename PipelineProblem>
|
||||
using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV4<PipelineProblem>;
|
||||
};
|
||||
|
||||
template <>
|
||||
struct PipelineTypeTraits<CK_TILE_PIPELINE_COMPUTE_V5>
|
||||
{
|
||||
template <typename PipelineProblem>
|
||||
using GemmPipeline = ck_tile::GemmPipelineAgBgCrCompV5<PipelineProblem>;
|
||||
template <typename PipelineProblem>
|
||||
using UniversalGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrCompV5<PipelineProblem>;
|
||||
};
|
||||
|
||||
} // namespace ops
|
||||
} // namespace ck_tile
|
||||
@@ -11,32 +11,29 @@ namespace ops {
|
||||
using BF16 = ck_tile::bfloat16_t;
|
||||
|
||||
using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2,
|
||||
NHWGC,
|
||||
GKYXC,
|
||||
NHWGK,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
BF16,
|
||||
BF16>;
|
||||
NHWGC,
|
||||
GKYXC,
|
||||
NHWGK,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
BF16,
|
||||
BF16>;
|
||||
|
||||
template <ck_tile::index_t NDimSpatial,
|
||||
typename ALayout,
|
||||
typename BLayout,
|
||||
typename ELayout>
|
||||
template <ck_tile::index_t NDimSpatial, typename ALayout, typename BLayout, typename ELayout>
|
||||
using tile_grouped_conv_bwd_data_bf16_instances = std::tuple<
|
||||
// clang-format off
|
||||
// clang-format off
|
||||
//###################################| Num| InLayout| WeiLayout| OutLayout| InData|WeiData|OutData| In| Wei| Out| Conv|K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM|
|
||||
//###################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline|
|
||||
//###################################|Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version|
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 16, 4, 1, 1,16, 16, 16, 4, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 16, 4, 1, 1,16, 16, 16, 4, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 2,2, false, CK_TILE_PIPELINE_MEMORY>
|
||||
// clang-format on
|
||||
>;
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
} // namespace ops
|
||||
} // namespace ck_tile
|
||||
|
||||
@@ -11,58 +11,55 @@ namespace ops {
|
||||
using BF16 = ck_tile::bfloat16_t;
|
||||
|
||||
using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2,
|
||||
NHWGC,
|
||||
GKYXC,
|
||||
NHWGK,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
BF16,
|
||||
BF16>;
|
||||
NHWGC,
|
||||
GKYXC,
|
||||
NHWGK,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
BF16,
|
||||
BF16>;
|
||||
|
||||
template <ck_tile::index_t NDimSpatial,
|
||||
typename ALayout,
|
||||
typename BLayout,
|
||||
typename ELayout>
|
||||
template <ck_tile::index_t NDimSpatial, typename ALayout, typename BLayout, typename ELayout>
|
||||
using tile_grouped_conv_bwd_data_bf16_instances_2 = std::tuple<
|
||||
// clang-format off
|
||||
// clang-format off
|
||||
//###################################| Num| InLayout| WeiLayout| OutLayout| InData|WeiData|OutData| In| Wei| Out| Conv|K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM|
|
||||
//###################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline|
|
||||
//###################################|Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version|
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 2,2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 2,2, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 2,2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 2,2, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 16, 4, 1, 1,16, 16, 16, 4, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>, // prob this
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 16, 4, 1, 1,16, 16, 16, 4, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 1, 8,4, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 8,4, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 1, 8,4, true, CK_TILE_PIPELINE_COMPUTE_V4>
|
||||
// clang-format on
|
||||
>;
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 2,2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 2,2, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 2,2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 2,2, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 16, 4, 1, 1,16, 16, 16, 4, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>, // prob this
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 16, 4, 1, 1,16, 16, 16, 4, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 1, 8,4, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 8,4, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 1, 8,4, true, CK_TILE_PIPELINE_COMPUTE_V4>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
} // namespace ops
|
||||
} // namespace ck_tile
|
||||
|
||||
@@ -11,61 +11,58 @@ namespace ops {
|
||||
using BF16 = ck_tile::bfloat16_t;
|
||||
|
||||
using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2,
|
||||
NHWGC,
|
||||
GKYXC,
|
||||
NHWGK,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
BF16,
|
||||
BF16>;
|
||||
NHWGC,
|
||||
GKYXC,
|
||||
NHWGK,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
BF16,
|
||||
BF16>;
|
||||
|
||||
template <ck_tile::index_t NDimSpatial,
|
||||
typename ALayout,
|
||||
typename BLayout,
|
||||
typename ELayout>
|
||||
template <ck_tile::index_t NDimSpatial, typename ALayout, typename BLayout, typename ELayout>
|
||||
using tile_grouped_conv_bwd_data_bf16_instances_3 = std::tuple<
|
||||
// clang-format off
|
||||
// clang-format off
|
||||
//###################################| Num| InLayout| WeiLayout| OutLayout| InData|WeiData|OutData| In| Wei| Out| Conv|K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM|
|
||||
//###################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline|
|
||||
//###################################|Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version|
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 1, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>, // ta
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 4,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 4,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 4,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 256, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 256, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 256, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 64, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 64, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 64, 32, 2, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 64, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 64, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>
|
||||
// clang-format on
|
||||
>;
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 1, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>, // ta
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 4,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 4,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 4,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 256, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 256, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 256, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 64, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 64, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 64, 32, 2, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 64, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 64, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
} // namespace ops
|
||||
} // namespace ck_tile
|
||||
|
||||
@@ -11,58 +11,55 @@ namespace ops {
|
||||
using BF16 = ck_tile::bfloat16_t;
|
||||
|
||||
using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2,
|
||||
NHWGC,
|
||||
GKYXC,
|
||||
NHWGK,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
BF16,
|
||||
BF16>;
|
||||
NHWGC,
|
||||
GKYXC,
|
||||
NHWGK,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
BF16,
|
||||
BF16>;
|
||||
|
||||
template <ck_tile::index_t NDimSpatial,
|
||||
typename ALayout,
|
||||
typename BLayout,
|
||||
typename ELayout>
|
||||
template <ck_tile::index_t NDimSpatial, typename ALayout, typename BLayout, typename ELayout>
|
||||
using tile_grouped_conv_bwd_data_bf16_instances_4 = std::tuple<
|
||||
// clang-format off
|
||||
// clang-format off
|
||||
//###################################| Num| InLayout| WeiLayout| OutLayout| InData|WeiData|OutData| In| Wei| Out| Conv|K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM|
|
||||
//###################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline|
|
||||
//###################################|Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version|
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 64, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 32, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 32, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 32, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 32, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 32, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 32, 32, 1, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 32, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 32, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 32, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 4, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 16, 4, 1, 1,16, 16, 16, 4, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 2,2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 2,2, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 4, 1, 1,32, 32, 16, 8, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 16, 4, 1, 1,32, 32, 16, 4, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>
|
||||
// clang-format on
|
||||
>;
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 64, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 32, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 32, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 32, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 32, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 32, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 32, 32, 1, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 32, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 32, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 32, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 4, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 16, 4, 1, 1,16, 16, 16, 4, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 2,2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 2,2, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 4, 1, 1,32, 32, 16, 8, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 16, 4, 1, 1,32, 32, 16, 4, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
} // namespace ops
|
||||
} // namespace ck_tile
|
||||
|
||||
@@ -11,62 +11,59 @@ namespace ops {
|
||||
using BF16 = ck_tile::bfloat16_t;
|
||||
|
||||
using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2,
|
||||
NHWGC,
|
||||
GKYXC,
|
||||
NHWGK,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
BF16,
|
||||
BF16>;
|
||||
NHWGC,
|
||||
GKYXC,
|
||||
NHWGK,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
BF16,
|
||||
BF16>;
|
||||
|
||||
template <ck_tile::index_t NDimSpatial,
|
||||
typename ALayout,
|
||||
typename BLayout,
|
||||
typename ELayout>
|
||||
template <ck_tile::index_t NDimSpatial, typename ALayout, typename BLayout, typename ELayout>
|
||||
using tile_grouped_conv_bwd_data_bf16_instances_5 = std::tuple<
|
||||
// clang-format off
|
||||
// clang-format off
|
||||
//###################################| Num| InLayout| WeiLayout| OutLayout| InData|WeiData|OutData| In| Wei| Out| Conv|K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM|
|
||||
//###################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline|
|
||||
//###################################|Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version|
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 1, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>, // ta
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 4,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 4,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 4,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 64, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 64, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 64, 32, 2, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 64, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 64, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 64, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>
|
||||
// clang-format on
|
||||
>;
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 1, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>, // ta
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 4,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 4,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 4,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 256, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 256, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 64, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 64, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 64, 32, 2, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 64, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 64, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 64, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
} // namespace ops
|
||||
} // namespace ck_tile
|
||||
|
||||
@@ -11,60 +11,57 @@ namespace ops {
|
||||
using BF16 = ck_tile::bfloat16_t;
|
||||
|
||||
using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2,
|
||||
NHWGC,
|
||||
GKYXC,
|
||||
NHWGK,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
BF16,
|
||||
BF16>;
|
||||
NHWGC,
|
||||
GKYXC,
|
||||
NHWGK,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
BF16,
|
||||
BF16>;
|
||||
|
||||
template <ck_tile::index_t NDimSpatial,
|
||||
typename ALayout,
|
||||
typename BLayout,
|
||||
typename ELayout>
|
||||
template <ck_tile::index_t NDimSpatial, typename ALayout, typename BLayout, typename ELayout>
|
||||
using tile_grouped_conv_bwd_data_bf16_instances_6 = std::tuple<
|
||||
// clang-format off
|
||||
// clang-format off
|
||||
//###################################| Num| InLayout| WeiLayout| OutLayout| InData|WeiData|OutData| In| Wei| Out| Conv|K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM|
|
||||
//###################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline|
|
||||
//###################################|Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version|
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 32, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 32, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 32, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 32, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 32, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 32, 32, 1, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 32, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 32, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 32, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 4, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 16, 4, 1, 1,16, 16, 16, 4, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 2,2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 2,2, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 4, 1, 1,32, 32, 16, 8, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 16, 4, 1, 1,32, 32, 16, 4, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 4, 1, 1,32, 32, 16, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 16, 4, 1, 1,16, 16, 16, 4, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 8,4, false, CK_TILE_PIPELINE_MEMORY>
|
||||
// clang-format on
|
||||
>;
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 128, 32, 2, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 2, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 32, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 32, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 32, 128, 32, 1, 2, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 32, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 32, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 32, 32, 1, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 32, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 32, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 32, 64, 32, 1, 1, 1,32, 32, 16, 8, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 8,8, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 8,8, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 4, 1, 1,32, 32, 16, 8, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 16, 4, 1, 1,16, 16, 16, 4, 8,8, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 2,2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 2,2, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 4, 1, 1,32, 32, 16, 8, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 16, 4, 1, 1,32, 32, 16, 4, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 64, 4, 1, 1,32, 32, 16, 16, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 32, 4, 1, 1,32, 32, 16, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 128, 32, 16, 4, 1, 1,16, 16, 16, 4, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Filter1x1Stride1Pad0, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 8,4, false, CK_TILE_PIPELINE_MEMORY>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
} // namespace ops
|
||||
} // namespace ck_tile
|
||||
|
||||
@@ -11,64 +11,61 @@ namespace ops {
|
||||
using BF16 = ck_tile::bfloat16_t;
|
||||
|
||||
using DeviceOp2DBF16 = GroupedConvolutionBackwardDataBaseInvoker<2,
|
||||
NHWGC,
|
||||
GKYXC,
|
||||
NHWGK,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
BF16,
|
||||
BF16>;
|
||||
NHWGC,
|
||||
GKYXC,
|
||||
NHWGK,
|
||||
BF16,
|
||||
BF16,
|
||||
BF16,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
BF16,
|
||||
BF16>;
|
||||
|
||||
template <ck_tile::index_t NDimSpatial,
|
||||
typename ALayout,
|
||||
typename BLayout,
|
||||
typename ELayout>
|
||||
template <ck_tile::index_t NDimSpatial, typename ALayout, typename BLayout, typename ELayout>
|
||||
using tile_grouped_conv_bwd_data_bf16_instances_7 = std::tuple<
|
||||
// clang-format off
|
||||
// clang-format off
|
||||
//###################################| Num| InLayout| WeiLayout| OutLayout| InData|WeiData|OutData| In| Wei| Out| Conv|K-block| M-tile| N-tile | K-tile | M-warp| N-warp| K-warp| M-warp| N-warp| K-warp| Vector| Vector| Vector| Double| GEMM|
|
||||
//###################################| Dim| | | | Type| Type| Type| Elementwise| Elementwise| Elementwise| Spec| per| | | | | | | tile| tile| tile| size| size| size| smem| pipeline|
|
||||
//###################################|Spatial| | | | | | | Operation| Operation| Operation| | CU| | | | | | | size| size| size| A| B| C| buffer| version|
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 4, 1, 1,32, 32, 16, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 16, 4, 1, 1,16, 16, 16, 4, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 16, 4, 1, 1,16, 16, 16, 4, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 2,2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 2,2, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 2,2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 2,2, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 16, 4, 1, 1,16, 16, 16, 4, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>, // prob this
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 16, 4, 1, 1,16, 16, 16, 4, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 1, 8,4, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 8,4, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 1, 8,4, true, CK_TILE_PIPELINE_COMPUTE_V4>
|
||||
// clang-format on
|
||||
>;
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 32, 4, 1, 1,32, 32, 16, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 128, 32, 16, 4, 1, 1,16, 16, 16, 4, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 16, 4, 1, 1,16, 16, 16, 4, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 2,2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 2,2, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 2,2, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 2,2, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 16, 4, 1, 1,16, 16, 16, 4, 2,2, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 64, 4, 1, 1,16, 16, 32, 16, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>, // prob this
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 4, 1, 1,16, 16, 32, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 16, 4, 1, 1,16, 16, 16, 4, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 16, 64, 32, 1, 1, 1,16, 16, 32, 1, 8,4, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 8,4, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 1,1, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 8, 1,1, true, CK_TILE_PIPELINE_COMPUTE_V4>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_MEMORY>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 1, 8,4, false, CK_TILE_PIPELINE_COMPUTE_V3>,
|
||||
// GroupedConvolutionBackwardDataInvoker<NDimSpatial, ALayout, BLayout, ELayout, BF16, BF16, BF16, PassThrough, PassThrough, PassThrough, ConvolutionSpecialization::Default, 1, 64, 16, 32, 1, 1, 1,16, 16, 32, 1, 8,4, true, CK_TILE_PIPELINE_COMPUTE_V4>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
} // namespace ops
|
||||
} // namespace ck_tile
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
#include "ck_tile/host/kernel_launch.hpp"
|
||||
#include "ck_tile/ops/epilogue.hpp"
|
||||
#include "ck_tile/ops/gemm.hpp"
|
||||
#include "ck_tile/ops/grouped_convolution.hpp"
|
||||
#include "ck_tile/ops/grouped_convolution.hpp"
|
||||
#include "ck_tile/library/tensor_operation_instance/gpu/gemm_configs.hpp"
|
||||
|
||||
namespace ck_tile {
|
||||
@@ -31,192 +31,209 @@ template <ck_tile::index_t NDimSpatial,
|
||||
typename ComputeTypeB = ComputeTypeA>
|
||||
struct GroupedConvolutionBackwardDataBaseInvoker
|
||||
{
|
||||
virtual bool IsSupportedArgument(const ck_tile::GroupedConvBwdDataHostArgs& args) const = 0;
|
||||
virtual float Run(const ck_tile::GroupedConvBwdDataHostArgs& args, bool time_kernel, int n_warmup, int n_repeat) const = 0;
|
||||
virtual std::string GetName(const ck_tile::GroupedConvBwdDataHostArgs& args) const = 0;
|
||||
virtual bool IsSupportedArgument(const ck_tile::GroupedConvBwdDataHostArgs& args) const = 0;
|
||||
virtual float Run(const ck_tile::GroupedConvBwdDataHostArgs& args,
|
||||
bool time_kernel,
|
||||
int n_warmup,
|
||||
int n_repeat) const = 0;
|
||||
virtual std::string GetName(const ck_tile::GroupedConvBwdDataHostArgs& args) const = 0;
|
||||
GroupedConvolutionBackwardDataBaseInvoker() = default;
|
||||
GroupedConvolutionBackwardDataBaseInvoker(const GroupedConvolutionBackwardDataBaseInvoker&) = default;
|
||||
GroupedConvolutionBackwardDataBaseInvoker& operator=(const GroupedConvolutionBackwardDataBaseInvoker&) = default;
|
||||
GroupedConvolutionBackwardDataBaseInvoker(GroupedConvolutionBackwardDataBaseInvoker&&) = default;
|
||||
GroupedConvolutionBackwardDataBaseInvoker& operator=(GroupedConvolutionBackwardDataBaseInvoker&&) = default;
|
||||
virtual ~GroupedConvolutionBackwardDataBaseInvoker() = default;
|
||||
GroupedConvolutionBackwardDataBaseInvoker(const GroupedConvolutionBackwardDataBaseInvoker&) =
|
||||
default;
|
||||
GroupedConvolutionBackwardDataBaseInvoker&
|
||||
operator=(const GroupedConvolutionBackwardDataBaseInvoker&) = default;
|
||||
GroupedConvolutionBackwardDataBaseInvoker(GroupedConvolutionBackwardDataBaseInvoker&&) =
|
||||
default;
|
||||
GroupedConvolutionBackwardDataBaseInvoker&
|
||||
operator=(GroupedConvolutionBackwardDataBaseInvoker&&) = default;
|
||||
virtual ~GroupedConvolutionBackwardDataBaseInvoker() = default;
|
||||
};
|
||||
|
||||
template <
|
||||
ck_tile::index_t NDimSpatial,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename OutLayout,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename OutDataType,
|
||||
typename InElementwiseOperation,
|
||||
typename WeiElementwiseOperation,
|
||||
typename OutElementwiseOperation,
|
||||
ck_tile::ConvolutionSpecialization ConvSpec,
|
||||
int kBlockPerCu,
|
||||
ck_tile::index_t M_Tile,
|
||||
ck_tile::index_t N_Tile,
|
||||
ck_tile::index_t K_Tile,
|
||||
ck_tile::index_t M_Warp,
|
||||
ck_tile::index_t N_Warp,
|
||||
ck_tile::index_t K_Warp,
|
||||
ck_tile::index_t M_Warp_Tile,
|
||||
ck_tile::index_t N_Warp_Tile,
|
||||
ck_tile::index_t K_Warp_Tile,
|
||||
ck_tile::index_t VectorSizeA,
|
||||
ck_tile::index_t VectorSizeB,
|
||||
ck_tile::index_t VectorSizeC,
|
||||
bool DoubleSmemBuffer,
|
||||
ck_tile::index_t PipelineVersion>
|
||||
struct GroupedConvolutionBackwardDataInvoker :
|
||||
public GroupedConvolutionBackwardDataBaseInvoker<NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementwiseOperation,
|
||||
WeiElementwiseOperation,
|
||||
OutElementwiseOperation>
|
||||
template <ck_tile::index_t NDimSpatial,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename OutLayout,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename OutDataType,
|
||||
typename InElementwiseOperation,
|
||||
typename WeiElementwiseOperation,
|
||||
typename OutElementwiseOperation,
|
||||
ck_tile::ConvolutionSpecialization ConvSpec,
|
||||
int kBlockPerCu,
|
||||
ck_tile::index_t M_Tile,
|
||||
ck_tile::index_t N_Tile,
|
||||
ck_tile::index_t K_Tile,
|
||||
ck_tile::index_t M_Warp,
|
||||
ck_tile::index_t N_Warp,
|
||||
ck_tile::index_t K_Warp,
|
||||
ck_tile::index_t M_Warp_Tile,
|
||||
ck_tile::index_t N_Warp_Tile,
|
||||
ck_tile::index_t K_Warp_Tile,
|
||||
ck_tile::index_t VectorSizeA,
|
||||
ck_tile::index_t VectorSizeB,
|
||||
ck_tile::index_t VectorSizeC,
|
||||
bool DoubleSmemBuffer,
|
||||
ck_tile::index_t PipelineVersion>
|
||||
struct GroupedConvolutionBackwardDataInvoker
|
||||
: public GroupedConvolutionBackwardDataBaseInvoker<NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType,
|
||||
InElementwiseOperation,
|
||||
WeiElementwiseOperation,
|
||||
OutElementwiseOperation>
|
||||
{
|
||||
using GemmShape = ck_tile::TileGemmShape<
|
||||
ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
|
||||
ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
|
||||
ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>,
|
||||
GemmConfigBase::PermuteA,
|
||||
GemmConfigBase::PermuteB>;
|
||||
using GemmShape =
|
||||
ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
|
||||
ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
|
||||
ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>,
|
||||
GemmConfigBase::PermuteA,
|
||||
GemmConfigBase::PermuteB>;
|
||||
|
||||
//static constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default;
|
||||
// static constexpr auto ConvSpec = ck_tile::ConvolutionSpecialization::Default;
|
||||
|
||||
using TilePartitioner =
|
||||
ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
|
||||
GemmConfigBase::TileParitionerGroupNum,
|
||||
GemmConfigBase::TileParitionerM01>;
|
||||
ck_tile::GemmSpatiallyLocalTilePartitioner<GemmShape,
|
||||
GemmConfigBase::TileParitionerGroupNum,
|
||||
GemmConfigBase::TileParitionerM01>;
|
||||
|
||||
using GroupedConvTraitsType = ck_tile::GroupedConvTraits<NDimSpatial,
|
||||
ConvSpec,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck_tile::tuple<>, // = DsLayout
|
||||
OutLayout,
|
||||
VectorSizeA,
|
||||
VectorSizeB,
|
||||
VectorSizeC>;
|
||||
ConvSpec,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck_tile::tuple<>, // = DsLayout
|
||||
OutLayout,
|
||||
VectorSizeA,
|
||||
VectorSizeB,
|
||||
VectorSizeC>;
|
||||
|
||||
using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits<
|
||||
GemmConfigBase::kPadM,
|
||||
GemmConfigBase::kPadN,
|
||||
GemmConfigBase::kPadK,
|
||||
DoubleSmemBuffer,
|
||||
typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdData::AsLayout,
|
||||
typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdData::BsLayout,
|
||||
typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdData::CLayout,
|
||||
GemmConfigBase::TransposeC,
|
||||
GemmConfigBase::UseStructuredSparsity,
|
||||
false, // Persistent,
|
||||
GemmConfigBase::NumWaveGroups>;
|
||||
GemmConfigBase::kPadM,
|
||||
GemmConfigBase::kPadN,
|
||||
GemmConfigBase::kPadK,
|
||||
DoubleSmemBuffer,
|
||||
typename GroupedConvTraitsType::template GroupedConvImplicitGemmTraitsBwdData<1>::AsLayout,
|
||||
typename GroupedConvTraitsType::template GroupedConvImplicitGemmTraitsBwdData<1>::BsLayout,
|
||||
typename GroupedConvTraitsType::template GroupedConvImplicitGemmTraitsBwdData<1>::CLayout,
|
||||
GemmConfigBase::TransposeC,
|
||||
GemmConfigBase::UseStructuredSparsity,
|
||||
false, // Persistent,
|
||||
GemmConfigBase::NumWaveGroups>;
|
||||
|
||||
using AccDataType = float;
|
||||
using AccDataType = float;
|
||||
using GemmPipelineProblem = ck_tile::GemmPipelineProblem<
|
||||
OutDataType,
|
||||
WeiDataType,
|
||||
AccDataType,
|
||||
GemmShape,
|
||||
typename GroupedConvTraitsType::GroupedConvImplicitGemmTraitsBwdData,
|
||||
ck_tile::element_wise::PassThrough,
|
||||
ck_tile::element_wise::PassThrough,
|
||||
InDataType,
|
||||
true,
|
||||
VectorSizeA,
|
||||
VectorSizeB>;
|
||||
OutDataType,
|
||||
WeiDataType,
|
||||
AccDataType,
|
||||
GemmShape,
|
||||
typename GroupedConvTraitsType::template GroupedConvImplicitGemmTraitsBwdData<1>,
|
||||
ck_tile::element_wise::PassThrough,
|
||||
ck_tile::element_wise::PassThrough,
|
||||
InDataType,
|
||||
true,
|
||||
VectorSizeA,
|
||||
VectorSizeB>;
|
||||
|
||||
using BaseGemmPipeline = typename PipelineTypeTraits<
|
||||
PipelineVersion>::template UniversalGemmPipeline<GemmPipelineProblem>;
|
||||
|
||||
using BaseGemmPipeline = typename PipelineTypeTraits<PipelineVersion>::template UniversalGemmPipeline<GemmPipelineProblem>;
|
||||
|
||||
template <bool HasHotLoop, ck_tile::TailNumber TailNumber, ck_tile::memory_operation_enum MemOp>
|
||||
auto CreateKernel() const
|
||||
{
|
||||
constexpr auto scheduler = GemmConfigBase::Scheduler;
|
||||
|
||||
|
||||
using UniversalGemmProblem =
|
||||
ck_tile::UniversalGemmPipelineProblem<OutDataType,
|
||||
WeiDataType,
|
||||
AccDataType,
|
||||
GemmShape,
|
||||
GemmUniversalTraits,
|
||||
scheduler,
|
||||
HasHotLoop,
|
||||
TailNumber,
|
||||
ck_tile::element_wise::PassThrough,
|
||||
ck_tile::element_wise::PassThrough,
|
||||
InDataType,
|
||||
true,
|
||||
VectorSizeA,
|
||||
VectorSizeB>;
|
||||
WeiDataType,
|
||||
AccDataType,
|
||||
GemmShape,
|
||||
GemmUniversalTraits,
|
||||
scheduler,
|
||||
// HasHotLoop,
|
||||
// TailNumber,
|
||||
ck_tile::element_wise::PassThrough,
|
||||
ck_tile::element_wise::PassThrough,
|
||||
InDataType,
|
||||
true,
|
||||
VectorSizeA,
|
||||
VectorSizeB>;
|
||||
|
||||
using GemmPipeline = typename PipelineTypeTraits<PipelineVersion>::template GemmPipeline<UniversalGemmProblem>;
|
||||
using GemmPipeline = typename PipelineTypeTraits<PipelineVersion>::template GemmPipeline<
|
||||
UniversalGemmProblem>;
|
||||
|
||||
using CDEElementWise = ck_tile::element_wise::PassThrough;
|
||||
|
||||
using ConvEpilogue = ck_tile::CShuffleEpilogue<ck_tile::CShuffleEpilogueProblem<
|
||||
OutDataType,
|
||||
WeiDataType,
|
||||
ck_tile::tuple<>, // = DsDataType
|
||||
AccDataType,
|
||||
InDataType,
|
||||
typename GroupedConvTraitsType::ImplicitGemmDsLayout,
|
||||
ck_tile::tensor_layout::gemm::RowMajor,
|
||||
CDEElementWise,
|
||||
TilePartitioner::MPerBlock,
|
||||
TilePartitioner::NPerBlock,
|
||||
M_Warp,
|
||||
N_Warp,
|
||||
M_Warp_Tile,
|
||||
N_Warp_Tile,
|
||||
K_Warp_Tile,
|
||||
GemmConfigBase::TransposeC,
|
||||
MemOp,
|
||||
1,
|
||||
true,
|
||||
GroupedConvTraitsType::VectorSizeC>>;
|
||||
using ConvEpilogue = ck_tile::CShuffleEpilogue<
|
||||
ck_tile::CShuffleEpilogueProblem<OutDataType,
|
||||
WeiDataType,
|
||||
ck_tile::tuple<>, // = DsDataType
|
||||
AccDataType,
|
||||
InDataType,
|
||||
typename GroupedConvTraitsType::ImplicitGemmDsLayout,
|
||||
ck_tile::tensor_layout::gemm::RowMajor,
|
||||
CDEElementWise,
|
||||
TilePartitioner::MPerBlock,
|
||||
TilePartitioner::NPerBlock,
|
||||
M_Warp,
|
||||
N_Warp,
|
||||
M_Warp_Tile,
|
||||
N_Warp_Tile,
|
||||
K_Warp_Tile,
|
||||
GemmConfigBase::TransposeC,
|
||||
MemOp,
|
||||
1,
|
||||
true,
|
||||
GroupedConvTraitsType::VectorSizeC>>;
|
||||
|
||||
return ck_tile::GroupedConvolutionBackwardDataKernel<GroupedConvTraitsType,
|
||||
TilePartitioner,
|
||||
GemmPipeline,
|
||||
ConvEpilogue>{};
|
||||
TilePartitioner,
|
||||
GemmPipeline,
|
||||
ConvEpilogue>{};
|
||||
}
|
||||
|
||||
bool IsSupportedArgument(const ck_tile::GroupedConvBwdDataHostArgs& args) const override
|
||||
{
|
||||
if (args.k_batch > 1)
|
||||
if(args.k_batch > 1)
|
||||
{
|
||||
using Kernel = decltype(CreateKernel<false, ck_tile::TailNumber::Empty, ck_tile::memory_operation_enum::atomic_add>());
|
||||
return Kernel::IsSupportedArgument(args);
|
||||
using Kernel = decltype(CreateKernel<false,
|
||||
ck_tile::TailNumber::Empty,
|
||||
ck_tile::memory_operation_enum::atomic_add>());
|
||||
return Kernel::IsSupportedArgument(args);
|
||||
}
|
||||
using Kernel = decltype(CreateKernel<false, ck_tile::TailNumber::Empty, ck_tile::memory_operation_enum::set>());
|
||||
using Kernel = decltype(CreateKernel<false,
|
||||
ck_tile::TailNumber::Empty,
|
||||
ck_tile::memory_operation_enum::set>());
|
||||
return Kernel::IsSupportedArgument(args);
|
||||
};
|
||||
|
||||
float Run(const ck_tile::GroupedConvBwdDataHostArgs& args, bool time_kernel, int n_warmup=5, int n_repeat=50) const override
|
||||
float Run(const ck_tile::GroupedConvBwdDataHostArgs& args,
|
||||
bool time_kernel,
|
||||
int n_warmup = 5,
|
||||
int n_repeat = 50) const override
|
||||
{
|
||||
[[maybe_unused]] ck_tile::index_t KGroups = 1;
|
||||
for(int i=0; i<args.num_dim_spatial_;i++) {
|
||||
KGroups *= args.filter_spatial_lengths_[i];//std::min(args.filter_spatial_lengths_[i], args.conv_filter_strides_[i]);
|
||||
[[maybe_unused]] ck_tile::index_t KGroups = 1;
|
||||
for(int i = 0; i < args.num_dim_spatial_; i++)
|
||||
{
|
||||
KGroups *= args.filter_spatial_lengths_[i]; // std::min(args.filter_spatial_lengths_[i],
|
||||
// args.conv_filter_strides_[i]);
|
||||
}
|
||||
|
||||
const index_t ConvStrideH = args.conv_filter_strides_[0];
|
||||
const index_t ConvStrideW = args.conv_filter_strides_[1];
|
||||
const index_t ConvDilationH = args.conv_filter_dilations_[0];
|
||||
const index_t ConvDilationW = args.conv_filter_dilations_[1];
|
||||
const auto GcdStrideDilationH = gcd(ConvStrideH, ConvDilationH);
|
||||
const auto GcdStrideDilationW = gcd(ConvStrideW, ConvDilationW);
|
||||
const auto YTilde = ConvStrideH / GcdStrideDilationH;
|
||||
const auto XTilde = ConvStrideW / GcdStrideDilationW;
|
||||
const auto Y = args.filter_spatial_lengths_[0];
|
||||
const auto X = args.filter_spatial_lengths_[1];
|
||||
[[maybe_unused]] const auto YDot = integer_divide_ceil(Y, YTilde);
|
||||
[[maybe_unused]] const auto XDot = integer_divide_ceil(X, XTilde);
|
||||
const index_t ConvStrideH = args.conv_filter_strides_[0];
|
||||
const index_t ConvStrideW = args.conv_filter_strides_[1];
|
||||
const index_t ConvDilationH = args.conv_filter_dilations_[0];
|
||||
const index_t ConvDilationW = args.conv_filter_dilations_[1];
|
||||
const auto GcdStrideDilationH = gcd(ConvStrideH, ConvDilationH);
|
||||
const auto GcdStrideDilationW = gcd(ConvStrideW, ConvDilationW);
|
||||
const auto YTilde = ConvStrideH / GcdStrideDilationH;
|
||||
const auto XTilde = ConvStrideW / GcdStrideDilationW;
|
||||
const auto Y = args.filter_spatial_lengths_[0];
|
||||
const auto X = args.filter_spatial_lengths_[1];
|
||||
[[maybe_unused]] const auto YDot = integer_divide_ceil(Y, YTilde);
|
||||
[[maybe_unused]] const auto XDot = integer_divide_ceil(X, XTilde);
|
||||
|
||||
const ck_tile::index_t gemm_k = args.K_ * XDot * YDot;
|
||||
|
||||
@@ -229,29 +246,28 @@ struct GroupedConvolutionBackwardDataInvoker :
|
||||
|
||||
printf("gemm_k: %d num_loop: %d, Xdot: %ld YDot: %ld\n", gemm_k, num_loop, XDot, YDot);
|
||||
|
||||
const auto Run = [&](const auto has_hot_loop_,
|
||||
const auto tail_number_,
|
||||
const auto memory_operation_) {
|
||||
constexpr bool has_hot_loop_v = has_hot_loop_.value;
|
||||
constexpr auto tail_number_v = tail_number_.value;
|
||||
constexpr auto memory_operation = memory_operation_.value;
|
||||
const auto Run =
|
||||
[&](const auto has_hot_loop_, const auto tail_number_, const auto memory_operation_) {
|
||||
constexpr bool has_hot_loop_v = has_hot_loop_.value;
|
||||
constexpr auto tail_number_v = tail_number_.value;
|
||||
constexpr auto memory_operation = memory_operation_.value;
|
||||
|
||||
auto kernel = CreateKernel<has_hot_loop_v, tail_number_v, memory_operation>();
|
||||
using Kernel = decltype(kernel);
|
||||
|
||||
auto kargs = Kernel::MakeKernelArgs(args);
|
||||
const dim3 grids = Kernel::GridSize(args);
|
||||
const dim3 blocks = Kernel::BlockSize();
|
||||
auto kernel = CreateKernel<has_hot_loop_v, tail_number_v, memory_operation>();
|
||||
using Kernel = decltype(kernel);
|
||||
|
||||
printf("grid: (%u, %u, %u)\n", grids.x, grids.y, grids.z);
|
||||
|
||||
ck_tile::stream_config s {nullptr, time_kernel, 1, n_warmup, n_repeat};
|
||||
auto kargs = Kernel::MakeKernelArgs(args);
|
||||
const dim3 grids = Kernel::GridSize(args);
|
||||
const dim3 blocks = Kernel::BlockSize();
|
||||
|
||||
ave_time = ck_tile::launch_kernel(
|
||||
s, ck_tile::make_kernel<kBlockPerCu>(kernel, grids, blocks, 0, kargs));
|
||||
printf("grid: (%u, %u, %u)\n", grids.x, grids.y, grids.z);
|
||||
|
||||
return ave_time;
|
||||
};
|
||||
ck_tile::stream_config s{nullptr, time_kernel, 1, n_warmup, n_repeat};
|
||||
|
||||
ave_time = ck_tile::launch_kernel(
|
||||
s, ck_tile::make_kernel<kBlockPerCu>(kernel, grids, blocks, 0, kargs));
|
||||
|
||||
return ave_time;
|
||||
};
|
||||
|
||||
const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
|
||||
if(args.k_batch == 1)
|
||||
@@ -272,22 +288,28 @@ struct GroupedConvolutionBackwardDataInvoker :
|
||||
{
|
||||
std::stringstream min_occupancy;
|
||||
min_occupancy << "_blk_per_cu_" << kBlockPerCu;
|
||||
if (args.k_batch > 1)
|
||||
if(args.k_batch > 1)
|
||||
{
|
||||
using Kernel = decltype(CreateKernel<false, ck_tile::TailNumber::Empty, ck_tile::memory_operation_enum::atomic_add>());
|
||||
return Kernel::GetName() + min_occupancy.str();
|
||||
using Kernel = decltype(CreateKernel<false,
|
||||
ck_tile::TailNumber::Empty,
|
||||
ck_tile::memory_operation_enum::atomic_add>());
|
||||
return Kernel::GetName() + min_occupancy.str();
|
||||
}
|
||||
using Kernel = decltype(CreateKernel<false, ck_tile::TailNumber::Empty, ck_tile::memory_operation_enum::set>());
|
||||
using Kernel = decltype(CreateKernel<false,
|
||||
ck_tile::TailNumber::Empty,
|
||||
ck_tile::memory_operation_enum::set>());
|
||||
return Kernel::GetName() + min_occupancy.str();
|
||||
};
|
||||
|
||||
GroupedConvolutionBackwardDataInvoker() = default;
|
||||
GroupedConvolutionBackwardDataInvoker() = default;
|
||||
GroupedConvolutionBackwardDataInvoker(const GroupedConvolutionBackwardDataInvoker&) = default;
|
||||
GroupedConvolutionBackwardDataInvoker& operator=(const GroupedConvolutionBackwardDataInvoker&) = default;
|
||||
GroupedConvolutionBackwardDataInvoker&
|
||||
operator=(const GroupedConvolutionBackwardDataInvoker&) = default;
|
||||
GroupedConvolutionBackwardDataInvoker(GroupedConvolutionBackwardDataInvoker&&) = default;
|
||||
GroupedConvolutionBackwardDataInvoker& operator=(GroupedConvolutionBackwardDataInvoker&&) = default;
|
||||
~GroupedConvolutionBackwardDataInvoker() override = default;
|
||||
};
|
||||
GroupedConvolutionBackwardDataInvoker&
|
||||
operator=(GroupedConvolutionBackwardDataInvoker&&) = default;
|
||||
~GroupedConvolutionBackwardDataInvoker() override = default;
|
||||
};
|
||||
|
||||
}
|
||||
}
|
||||
} // namespace ops
|
||||
} // namespace ck_tile
|
||||
|
||||
@@ -6,4 +6,3 @@ include_directories(BEFORE
|
||||
)
|
||||
|
||||
add_subdirectory(src)
|
||||
add_subdirectory(ck_tile)
|
||||
|
||||
Reference in New Issue
Block a user