[CK TILE] Grouped Convolution Forward Kernel

This commit is contained in:
Bartlomiej Kocot
2025-05-13 11:52:11 +00:00
parent 3448e12609
commit aaca2e2b08
12 changed files with 2970 additions and 0 deletions

View File

@@ -0,0 +1,4 @@
add_executable(tile_example_grouped_conv_fwd EXCLUDE_FROM_ALL grouped_convolution_forward.cpp)
set(EXAMPLE_CONV_COMPILE_OPTIONS)
list(APPEND EXAMPLE_CONV_COMPILE_OPTIONS -mllvm -enable-noalias-to-md-conversion=0)
target_compile_options(tile_example_grouped_conv_fwd PRIVATE ${EXAMPLE_GEMM_COMPILE_OPTIONS})

View File

@@ -0,0 +1,190 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#include <hip/hip_runtime.h>
#include <cstring>
#include <iostream>
#include <ostream>
#include <string>
#include <tuple>
#include "ck_tile/host.hpp"
#include "grouped_convolution_utils.hpp"
template <ck_tile::index_t NDimSpatial,
typename InDataType,
typename WeiDataType,
typename AccDataType,
typename OutDataType,
typename InLayout,
typename WeiLayout,
typename OutLayout>
float grouped_conv_fwd_calc(const ck_tile::GroupedConvHostArgs& args,
const ck_tile::stream_config& s)
{
constexpr int kBlockPerCu = 1;
// This part comes from the Codegen
constexpr ck_tile::index_t M_Tile = 64;
constexpr ck_tile::index_t N_Tile = 64;
constexpr ck_tile::index_t K_Tile = 64;
constexpr ck_tile::index_t M_Warp = 2;
constexpr ck_tile::index_t N_Warp = 2;
constexpr ck_tile::index_t K_Warp = 1;
constexpr ck_tile::index_t M_Warp_Tile = 32;
constexpr ck_tile::index_t N_Warp_Tile = 32;
constexpr ck_tile::index_t K_Warp_Tile = 16;
// Implicit GEMM Traits
using CodegenShape =
ck_tile::TileGemmShape<ck_tile::sequence<M_Tile, N_Tile, K_Tile>,
ck_tile::sequence<M_Warp, N_Warp, K_Warp>,
ck_tile::sequence<M_Warp_Tile, N_Warp_Tile, K_Warp_Tile>>;
using TilePartitioner = ck_tile::GemmTile1DPartitioner<CodegenShape>;
using CodegenTraits = ck_tile::GroupedConvImplicitGemmTraits;
using CodegenPipelineProblem = ck_tile::
GemmPipelineProblem<InDataType, WeiDataType, AccDataType, CodegenShape, CodegenTraits>;
using CodegenPipeline = ck_tile::GemmPipelineAGmemBGmemCRegV1<CodegenPipelineProblem>;
const auto Run = [&](const auto memory_operation_) {
constexpr auto memory_operation = memory_operation_.value;
using ConvEpilogue = ck_tile::CShuffleEpilogue<
ck_tile::CShuffleEpilogueProblem<InDataType,
WeiDataType,
AccDataType,
OutDataType,
ck_tile::tensor_layout::gemm::RowMajor,
CodegenPipelineProblem::kBlockSize,
TilePartitioner::MPerBlock,
TilePartitioner::NPerBlock,
M_Warp,
N_Warp,
M_Warp_Tile,
N_Warp_Tile,
K_Warp_Tile,
CodegenPipelineProblem::TransposeC,
memory_operation>>;
constexpr auto ConvSpec = ck_tile::ConvolutionForwardSpecialization::Default;
using Kernel = ck_tile::GroupedConvolutionForwardKernel<NDimSpatial,
ConvSpec,
InLayout,
WeiLayout,
OutLayout,
TilePartitioner,
CodegenPipeline,
ConvEpilogue>;
auto kargs = Kernel::MakeKernelArgs(args);
const dim3 grids = Kernel::GridSize(args);
constexpr dim3 blocks = Kernel::BlockSize();
if(!Kernel::IsSupportedArgument(kargs))
{
throw std::runtime_error("Wrong! Arguments not supported! Skipping conv!\n");
}
if(s.log_level_ > 0)
{
std::cout << "Launching kernel with args: " << Kernel::GetName() << '\n'
<< "shape: " << CodegenShape::GetName() << '\n'
<< "problem: " << CodegenPipelineProblem::GetName() << '\n'
<< "pipeline: " << CodegenPipeline::GetName() << '\n'
<< "grid: {" << grids.x << ", " << grids.y << ", " << grids.z << "}"
<< ", blocks: {" << blocks.x << ", " << blocks.y << ", " << blocks.z << "}"
<< std::endl;
}
float ave_time = ck_tile::launch_kernel(
s, ck_tile::make_kernel<blocks.x, kBlockPerCu>(Kernel{}, grids, blocks, 0, kargs));
return ave_time;
};
return Run(ck_tile::integral_constant<ck_tile::memory_operation_enum,
ck_tile::memory_operation_enum::set>{});
}
#include "run_grouped_convolution_example.inc"
template <typename InPrecType, typename WeiPrecType = InPrecType, typename OutPrecType = InPrecType>
int run_grouped_conv_fwd_example_prec_type(
std::string in_layout, std::string wei_layout, std::string out_layout, int argc, char* argv[])
{
using NWGC = ck_tile::tensor_layout::convolution::NWGC;
using NHWGC = ck_tile::tensor_layout::convolution::NHWGC;
using NDHWGC = ck_tile::tensor_layout::convolution::NDHWGC;
using GKXC = ck_tile::tensor_layout::convolution::GKXC;
using GKYXC = ck_tile::tensor_layout::convolution::GKYXC;
using GKZYXC = ck_tile::tensor_layout::convolution::GKZYXC;
using NWGK = ck_tile::tensor_layout::convolution::NWGK;
using NHWGK = ck_tile::tensor_layout::convolution::NHWGK;
using NDHWGK = ck_tile::tensor_layout::convolution::NDHWGK;
if(in_layout == "NWGC" && wei_layout == "GKXC" && out_layout == "NWGK")
{
return run_grouped_conv_fwd_example_with_layouts<ck_tile::number<1>{},
InPrecType,
WeiPrecType,
OutPrecType>(
argc, argv, NWGC{}, GKXC{}, NWGK{});
}
else if(in_layout == "NHWGC" && wei_layout == "GKYXC" && out_layout == "NHWGK")
{
return run_grouped_conv_fwd_example_with_layouts<ck_tile::number<2>{},
InPrecType,
WeiPrecType,
OutPrecType>(
argc, argv, NHWGC{}, GKYXC{}, NHWGK{});
}
else if(in_layout == "NDHWGC" && wei_layout == "GKZYXC" && out_layout == "GKZYXC")
{
return run_grouped_conv_fwd_example_with_layouts<ck_tile::number<3>{},
InPrecType,
WeiPrecType,
OutPrecType>(
argc, argv, NDHWGC{}, GKZYXC{}, NDHWGK{});
}
else
{
throw std::runtime_error("Unsupported memory layout!");
}
}
int run_grouped_conv_fwd_example(int argc, char* argv[])
{
auto [result, arg_parser] = create_args(argc, argv);
if(!result)
return -1;
std::string data_type = arg_parser.get_str("prec");
std::string in_layout = arg_parser.get_str("in_layout");
std::string wei_layout = arg_parser.get_str("wei_layout");
std::string out_layout = arg_parser.get_str("out_layout");
if(data_type == "fp16")
{
return run_grouped_conv_fwd_example_prec_type<ck_tile::half_t>(
in_layout, wei_layout, out_layout, argc, argv);
}
else if(data_type == "bf16")
{
return run_grouped_conv_fwd_example_prec_type<ck_tile::bf16_t>(
in_layout, wei_layout, out_layout, argc, argv);
}
else
{
throw std::runtime_error("Unsupported data type for this operation !!!");
}
}
int main(int argc, char* argv[]) { return !run_grouped_conv_fwd_example(argc, argv); }

View File

@@ -0,0 +1,104 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <string>
#include "ck_tile/core.hpp"
#include "ck_tile/host/kernel_launch.hpp"
#include "ck_tile/ops/epilogue.hpp"
#include "ck_tile/ops/grouped_convolution.hpp"
ck_tile::index_t fill_spatial_dimensions(std::vector<ck_tile::index_t>& filter_spatial_lengths,
std::vector<ck_tile::index_t>& image_spatial_lengths,
std::vector<ck_tile::index_t>& strides,
std::vector<ck_tile::index_t>& dilations,
std::vector<ck_tile::index_t>& lpads,
std::vector<ck_tile::index_t>& rpads,
ck_tile::ArgParser& arg_parser)
{
constexpr ck_tile::index_t non_sp_dims = 3;
const ck_tile::index_t n_dim_sp = arg_parser.get_str("in_layout").size() - non_sp_dims;
assert(n_dim_sp >= 1 && n_dim_sp <= 3 && "Wrong layout!");
if(n_dim_sp == 3)
{
filter_spatial_lengths.push_back(arg_parser.get_int("z"));
image_spatial_lengths.push_back(arg_parser.get_int("d"));
strides.push_back(arg_parser.get_int("stride_d"));
dilations.push_back(arg_parser.get_int("dilation_d"));
lpads.push_back(arg_parser.get_int("lpad_d"));
rpads.push_back(arg_parser.get_int("rpad_d"));
}
if(n_dim_sp >= 2)
{
filter_spatial_lengths.push_back(arg_parser.get_int("y"));
image_spatial_lengths.push_back(arg_parser.get_int("h"));
strides.push_back(arg_parser.get_int("stride_h"));
dilations.push_back(arg_parser.get_int("dilation_h"));
lpads.push_back(arg_parser.get_int("lpad_h"));
rpads.push_back(arg_parser.get_int("rpad_h"));
}
filter_spatial_lengths.push_back(arg_parser.get_int("x"));
image_spatial_lengths.push_back(arg_parser.get_int("w"));
strides.push_back(arg_parser.get_int("stride_w"));
dilations.push_back(arg_parser.get_int("dilation_w"));
lpads.push_back(arg_parser.get_int("lpad_w"));
rpads.push_back(arg_parser.get_int("rpad_w"));
return n_dim_sp;
}
auto create_args(int argc, char* argv[])
{
ck_tile::ArgParser arg_parser;
arg_parser.insert("g", "2", "group dimension")
.insert("n", "32", "n dimension")
.insert("k", "32", "k dimension")
.insert("c", "32", "c dimension")
.insert("d", "64", "d dimension")
.insert("h", "64", "h dimension")
.insert("w", "64", "w dimension")
.insert("z", "4", "z dimension")
.insert("y", "4", "y dimension")
.insert("x", "4", "x dimension")
.insert("stride_d", "1", "d stride")
.insert("stride_h", "1", "h stride")
.insert("stride_w", "1", "w stride")
.insert("dilation_d", "1", "d dilation")
.insert("dilation_h", "1", "h dilation")
.insert("dilation_w", "1", "w dilation")
.insert("lpad_d", "0", "left pad for d dimension")
.insert("lpad_h", "0", "left pad for h dimension")
.insert("lpad_w", "0", "left pad for w dimension")
.insert("rpad_d", "0", "right pad for d dimension")
.insert("rpad_h", "0", "right pad for h dimension")
.insert("rpad_w", "0", "right pad for w dimension")
.insert("in_layout", "NHWGC", "Input image layout - NHWGC by default")
.insert("wei_layout", "GKYXC", "Weight layout - GKYXC by default")
.insert("out_layout", "NHWGK", "Output image layout - NHWGK by default")
.insert("v", "1", "0. No validation, 1. Validation on CPU, 2. Validation on GPU")
.insert("prec", "fp16", "data type. fp16/bf16/fp8/bf8")
.insert("warmup", "50", "number of iterations before benchmark the kernel")
.insert("repeat", "100", "number of iterations to benchmark the kernel")
.insert("timer", "gpu", "gpu:gpu timer, cpu:cpu timer")
.insert("split_k", "1", "splitK value")
.insert("init", "0", "0:random, 1:linear, 2:constant(1)");
bool result = arg_parser.parse(argc, argv);
return std::make_tuple(result, arg_parser);
}
// host API
float gemm_calc(const ck_tile::GroupedConvHostArgs& args, const ck_tile::stream_config& s);

View File

@@ -0,0 +1,205 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
template <typename InDataType, typename WeiDataType, typename AccDataType, typename OutDataType>
auto calculate_rtol_atol(const ck_tile::index_t GemmK,
const ck_tile::index_t kbatch,
const float max_accumulated_value)
{
using ComputeType =
std::conditional_t<sizeof(InDataType) < sizeof(WeiDataType), InDataType, WeiDataType>;
// Calculate thresholds
const auto rtol = ck_tile::get_relative_threshold<ComputeType, OutDataType, AccDataType>(
ck_tile::integer_divide_ceil(GemmK, kbatch));
const auto atol = ck_tile::get_absolute_threshold<ComputeType, OutDataType, AccDataType>(
max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(GemmK, kbatch));
// Calculate error due to split_k accumulation
const auto rtol_split_k =
ck_tile::get_relative_threshold<OutDataType, OutDataType, OutDataType>(kbatch);
const auto atol_split_k =
ck_tile::get_absolute_threshold<OutDataType, OutDataType, OutDataType>(
max_accumulated_value, kbatch);
// Use higher threshold
return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k));
}
template <ck_tile::index_t NDimSpatial,
typename InDataType,
typename WeiDataType,
typename AccDataType,
typename OutDataType,
typename InLayout,
typename WeiLayout,
typename OutLayout>
float invoke_grouped_conv_fwd(ck_tile::GroupedConvHostArgs& args, int n_warmup, int n_repeat)
{
float ave_time = grouped_conv_fwd_calc<NDimSpatial,
InDataType,
WeiDataType,
AccDataType,
OutDataType,
InLayout,
WeiLayout,
OutLayout>(
args, ck_tile::stream_config{nullptr, true, 1, n_warmup, n_repeat});
std::size_t flop = args.GetFlops();
std::size_t num_byte = args.GetByte<InDataType, WeiDataType, OutDataType>();
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_byte / 1.E6 / ave_time;
std::cout << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< std::endl;
return ave_time;
}
template <ck_tile::index_t NDimSpatial,
typename InDataType,
typename WeiDataType = InDataType,
typename OutDataType = InDataType,
typename InLayout,
typename WeiLayout,
typename OutLayout>
int run_grouped_conv_fwd_example_with_layouts(
int argc, char* argv[], const InLayout, const WeiLayout, const OutLayout)
{
auto [result, arg_parser] = create_args(argc, argv);
if(!result)
return -1;
using AccDataType = float;
std::vector<ck_tile::index_t> filter_spatial_lengths;
std::vector<ck_tile::index_t> image_spatial_lengths;
std::vector<ck_tile::index_t> strides;
std::vector<ck_tile::index_t> dilations;
std::vector<ck_tile::index_t> lpads;
std::vector<ck_tile::index_t> rpads;
const ck_tile::index_t num_dim_sp = fill_spatial_dimensions(filter_spatial_lengths,
image_spatial_lengths,
strides,
dilations,
lpads,
rpads,
arg_parser);
ck_tile::conv::ConvParam conv_param{num_dim_sp,
arg_parser.get_int("g"),
arg_parser.get_int("n"),
arg_parser.get_int("k"),
arg_parser.get_int("c"),
filter_spatial_lengths,
image_spatial_lengths,
strides,
dilations,
lpads,
rpads};
ck_tile::index_t kbatch = arg_parser.get_int("split_k");
int n_warmup = arg_parser.get_int("warmup");
int n_repeat = arg_parser.get_int("repeat");
ck_tile::index_t init_method = arg_parser.get_int("init");
const auto in_g_n_c_wis_desc =
ck_tile::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<InLayout>(conv_param);
const auto wei_g_k_c_xs_desc =
ck_tile::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<WeiLayout>(conv_param);
const auto out_g_n_k_wos_desc =
ck_tile::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<OutLayout>(conv_param);
ck_tile::HostTensor<InDataType> input(in_g_n_c_wis_desc);
ck_tile::HostTensor<WeiDataType> weight(wei_g_k_c_xs_desc);
ck_tile::HostTensor<OutDataType> output(out_g_n_k_wos_desc);
if(init_method == 0)
{
ck_tile::FillUniformDistribution<InDataType>{-1.f, 1.f}(input);
ck_tile::FillUniformDistribution<WeiDataType>{-1.f, 1.f}(weight);
}
else if(init_method == 1)
{
ck_tile::FillMonotonicSeq<InDataType>{}(input);
ck_tile::FillMonotonicSeq<WeiDataType>{}(weight);
}
else if(init_method == 2)
{
ck_tile::FillUniformDistribution<InDataType>{1.f, 1.f}(input);
ck_tile::FillUniformDistribution<WeiDataType>{1.f, 1.f}(weight);
}
else
{
input.SetZero();
weight.SetZero();
}
ck_tile::DeviceMem input_dev_buf(input.get_element_space_size_in_bytes());
ck_tile::DeviceMem weight_dev_buf(weight.get_element_space_size_in_bytes());
ck_tile::DeviceMem output_dev_buf(output.get_element_space_size_in_bytes());
input_dev_buf.ToDevice(input.data());
weight_dev_buf.ToDevice(weight.data());
output_dev_buf.SetZero();
ck_tile::GroupedConvHostArgs args(conv_param,
input_dev_buf.GetDeviceBuffer(),
weight_dev_buf.GetDeviceBuffer(),
output_dev_buf.GetDeviceBuffer(),
kbatch);
std::cout << "Run Grouped Conv Fwd kernel" << std::endl;
std::cout << "input: " << input.mDesc << std::endl;
std::cout << "weight: " << weight.mDesc << std::endl;
std::cout << "output: " << output.mDesc << std::endl;
invoke_grouped_conv_fwd<NDimSpatial,
InDataType,
WeiDataType,
AccDataType,
OutDataType,
InLayout,
WeiLayout,
OutLayout>(args, n_warmup, n_repeat);
output_dev_buf.FromDevice(output.data());
bool pass = true;
if(arg_parser.get_int("v") == 1)
{
ck_tile::HostTensor<OutDataType> output_host_ref(out_g_n_k_wos_desc);
output_host_ref.SetZero();
ck_tile::reference_grouped_conv_fwd<NDimSpatial, InDataType, WeiDataType, OutDataType>(
input,
weight,
output_host_ref,
conv_param.conv_filter_strides_,
conv_param.conv_filter_dilations_,
conv_param.input_left_pads_,
conv_param.input_right_pads_);
const ck_tile::index_t GemmK = weight.get_element_size() / (conv_param.G_ * conv_param.K_);
const float max_accumulated_value =
*std::max_element(output_host_ref.mData.begin(), output_host_ref.mData.end());
const auto rtol_atol =
calculate_rtol_atol<InDataType, WeiDataType, AccDataType, OutDataType>(
GemmK, kbatch, max_accumulated_value);
pass = ck_tile::check_err(output,
output_host_ref,
"Error: Incorrect results!",
rtol_atol.at(ck_tile::number<0>{}),
rtol_atol.at(ck_tile::number<1>{}));
std::cout << "Relative error threshold: " << rtol_atol.at(ck_tile::number<0>{})
<< " Absolute error threshold: " << rtol_atol.at(ck_tile::number<1>{})
<< std::endl;
std::cout << "The CPU verification result is:" << (pass ? "correct" : "fail") << std::endl;
}
else if(arg_parser.get_int("v") == 2)
{
throw std::runtime_error("Unsupported gpu verification !!!");
}
return pass;
}

View File

@@ -20,3 +20,4 @@ add_subdirectory(17_grouped_gemm)
add_subdirectory(18_flatmm)
add_subdirectory(35_batched_transpose)
add_subdirectory(36_copy)
add_subdirectory(37_grouped_convolution)

View File

@@ -34,5 +34,6 @@
#include "ck_tile/host/reference/reference_rowwise_quantization2d.hpp"
#include "ck_tile/host/reference/reference_softmax.hpp"
#include "ck_tile/host/reference/reference_topk.hpp"
#include "ck_tile/host/reference/reference_grouped_conv_fwd.hpp"
#include "ck_tile/host/stream_config.hpp"
#include "ck_tile/host/timer.hpp"

View File

@@ -0,0 +1,165 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstdlib>
#include <thread>
#include "ck_tile/core.hpp"
#include "ck_tile/host/host_tensor.hpp"
namespace ck_tile {
template <ck_tile::index_t NDimSpatial,
typename InDataType,
typename WeiDataType,
typename OutDataType>
CK_TILE_HOST void reference_grouped_conv_fwd(const HostTensor<InDataType>& input,
const HostTensor<WeiDataType>& weight,
HostTensor<OutDataType>& output,
std::vector<ck_tile::long_index_t> conv_strides,
std::vector<ck_tile::long_index_t> conv_dilations,
std::vector<ck_tile::long_index_t> in_left_pads,
std::vector<ck_tile::long_index_t>)
{
if(!(input.get_num_of_dimension() == NDimSpatial + 3 &&
weight.get_num_of_dimension() == NDimSpatial + 3 &&
output.get_num_of_dimension() == NDimSpatial + 3))
{
throw std::runtime_error("wrong! inconsistent dimension");
}
if constexpr(NDimSpatial == 1)
{
auto func = [&](auto g, auto n, auto k, auto wo) {
float v_acc = 0;
for(std::size_t c = 0; c < weight.get_lengths()[2]; ++c)
{
for(std::size_t x = 0; x < weight.get_lengths()[3]; ++x)
{
auto wi = static_cast<ck_tile::long_index_t>(wo * conv_strides[0]) +
static_cast<ck_tile::long_index_t>(x * conv_dilations[0]) -
static_cast<ck_tile::long_index_t>(in_left_pads[0]);
if(wi >= 0 && ck_tile::type_convert<std::size_t>(wi) < input.get_lengths()[3])
{
InDataType v_in = input(g, n, c, wi);
WeiDataType v_wei = weight(g, k, c, x);
v_acc += ck_tile::type_convert<float>(v_in) *
ck_tile::type_convert<float>(v_wei);
}
}
}
OutDataType v_acc_converted = ck_tile::type_convert<OutDataType>(v_acc);
output(g, n, k, wo) = v_acc_converted;
};
make_ParallelTensorFunctor(func,
output.get_lengths()[0],
output.get_lengths()[1],
output.get_lengths()[2],
output.get_lengths()[3])(std::thread::hardware_concurrency());
}
else if constexpr(NDimSpatial == 2)
{
auto func = [&](auto g, auto n, auto k, auto ho, auto wo) {
float v_acc = 0;
for(std::size_t c = 0; c < weight.get_lengths()[2]; ++c)
{
for(std::size_t y = 0; y < weight.get_lengths()[3]; ++y)
{
auto hi = static_cast<ck_tile::long_index_t>(ho * conv_strides[0]) +
static_cast<ck_tile::long_index_t>(y * conv_dilations[0]) -
static_cast<ck_tile::long_index_t>(in_left_pads[0]);
for(std::size_t x = 0; x < weight.get_lengths()[4]; ++x)
{
auto wi = static_cast<ck_tile::long_index_t>(wo * conv_strides[1]) +
static_cast<ck_tile::long_index_t>(x * conv_dilations[1]) -
static_cast<ck_tile::long_index_t>(in_left_pads[1]);
if(hi >= 0 &&
ck_tile::type_convert<std::size_t>(hi) < input.get_lengths()[3] &&
wi >= 0 &&
ck_tile::type_convert<std::size_t>(wi) < input.get_lengths()[4])
{
InDataType v_in = input(g, n, c, hi, wi);
WeiDataType v_wei = weight(g, k, c, y, x);
v_acc += ck_tile::type_convert<float>(v_in) *
ck_tile::type_convert<float>(v_wei);
}
}
}
}
OutDataType v_acc_converted = ck_tile::type_convert<OutDataType>(v_acc);
output(g, n, k, ho, wo) = v_acc_converted;
};
make_ParallelTensorFunctor(func,
output.get_lengths()[0],
output.get_lengths()[1],
output.get_lengths()[2],
output.get_lengths()[3],
output.get_lengths()[4])(std::thread::hardware_concurrency());
}
else if constexpr(NDimSpatial == 3)
{
auto func = [&](auto g, auto n, auto k, auto d_o, auto ho, auto wo) {
float v_acc = 0;
for(std::size_t c = 0; c < weight.get_lengths()[2]; ++c)
{
for(std::size_t z = 0; z < weight.get_lengths()[3]; ++z)
{
auto di = static_cast<ck_tile::long_index_t>(d_o * conv_strides[0]) +
static_cast<ck_tile::long_index_t>(z * conv_dilations[0]) -
static_cast<ck_tile::long_index_t>(in_left_pads[0]);
for(std::size_t y = 0; y < weight.get_lengths()[4]; ++y)
{
auto hi = static_cast<ck_tile::long_index_t>(ho * conv_strides[1]) +
static_cast<ck_tile::long_index_t>(y * conv_dilations[1]) -
static_cast<ck_tile::long_index_t>(in_left_pads[1]);
for(std::size_t x = 0; x < weight.get_lengths()[5]; ++x)
{
auto wi = static_cast<ck_tile::long_index_t>(wo * conv_strides[2]) +
static_cast<ck_tile::long_index_t>(x * conv_dilations[2]) -
static_cast<ck_tile::long_index_t>(in_left_pads[2]);
if(di >= 0 &&
ck_tile::type_convert<std::size_t>(di) < input.get_lengths()[3] &&
hi >= 0 &&
ck_tile::type_convert<std::size_t>(hi) < input.get_lengths()[4] &&
wi >= 0 &&
ck_tile::type_convert<std::size_t>(wi) < input.get_lengths()[5])
{
InDataType v_in = input(g, n, c, di, hi, wi);
WeiDataType v_wei = weight(g, k, c, z, y, x);
v_acc += ck_tile::type_convert<float>(v_in) *
ck_tile::type_convert<float>(v_wei);
}
}
}
}
}
OutDataType v_acc_converted = ck_tile::type_convert<OutDataType>(v_acc);
output(g, n, k, d_o, ho, wo) = v_acc_converted;
};
make_ParallelTensorFunctor(func,
output.get_lengths()[0],
output.get_lengths()[1],
output.get_lengths()[2],
output.get_lengths()[3],
output.get_lengths()[4],
output.get_lengths()[5])(std::thread::hardware_concurrency());
}
else
{
throw std::runtime_error("Ref_Conv_fwd: number of dimensions must be between 1 and 3.");
}
}
} // namespace ck_tile

View File

@@ -0,0 +1,54 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/ops/gemm/block/block_gemm_areg_bgmem_creg_v1.hpp"
#include "ck_tile/ops/gemm/block/block_gemm_areg_bgmem_creg_v1_default_policy.hpp"
#include "ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1.hpp"
#include "ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1_custom_policy.hpp"
#include "ck_tile/ops/gemm/block/block_gemm_areg_breg_creg_v1_default_policy.hpp"
#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_one_warp_v1.hpp"
#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1.hpp"
#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_custom_policy.hpp"
#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v1_default_policy.hpp"
#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2.hpp"
#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2_custom_policy.hpp"
#include "ck_tile/ops/gemm/block/block_gemm_areg_bsmem_creg_v2_default_policy.hpp"
#include "ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1.hpp"
#include "ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1_custom_policy.hpp"
#include "ck_tile/ops/gemm/block/block_gemm_asmem_breg_creg_v1_default_policy.hpp"
#include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1.hpp"
#include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_custom_policy.hpp"
#include "ck_tile/ops/gemm/block/block_gemm_asmem_bsmem_creg_v1_default_policy.hpp"
#include "ck_tile/ops/gemm/block/block_gemm_problem.hpp"
#include "ck_tile/ops/gemm/block/block_universal_gemm_as_bs_cr.hpp"
#include "ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_base.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v3.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_comp_v4_default_policy.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_mem.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_ag_bg_cr_scheduler.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v1_default_policy.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_agmem_bgmem_creg_v2_default_policy.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_pipeline_problem.hpp"
#include "ck_tile/ops/gemm/pipeline/gemm_universal_pipeline_ag_bg_cr_policy.hpp"
#include "ck_tile/ops/gemm/pipeline/tile_gemm_shape.hpp"
#include "ck_tile/ops/gemm/pipeline/tile_gemm_traits.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_mfma_impl.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm_attribute_smfmac_impl.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm_dispatcher.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm_impl.hpp"
#include "ck_tile/ops/gemm/warp/warp_gemm_smfmac_impl.hpp"
#include "ck_tile/ops/grouped_convolution/kernel/grouped_convolution_forward_kernel.hpp"
#include "ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp"
#include "ck_tile/ops/grouped_convolution/utils/convolution_forward_specialization.hpp"
#include "ck_tile/ops/common/generic_2d_block_shape.hpp"
#include "ck_tile/ops/common/tensor_layout.hpp"
#include "ck_tile/ops/common/utils.hpp"

View File

@@ -0,0 +1,729 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <string>
#include "ck_tile/core.hpp"
#include "ck_tile/ops/common.hpp"
#include "ck_tile/host/concat.hpp"
#include "ck_tile/core/utility/env.hpp"
#include "ck_tile/host/convolution_parameter.hpp"
#include "ck_tile/ops/grouped_convolution/utils/transform_conv_fwd_to_gemm.hpp"
#include "ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp"
namespace ck_tile {
/// @brief The Grouped Convolution kernel device arguments.
template <index_t NDimSpatial,
ConvolutionForwardSpecialization ConvForwardSpecialization,
typename InLayout,
typename WeiLayout,
typename OutLayout>
struct GroupedConvFwdKernelArgs
{
using ConvToGemmFwdTransformer = TransformConvFwdToGemm<NDimSpatial, ConvForwardSpecialization>;
template <
typename InLay = InLayout,
typename WeiLay = WeiLayout,
typename OutLay = OutLayout,
typename std::enable_if<std::is_same_v<InLay, tensor_layout::convolution::NWGC> &&
std::is_same_v<WeiLay, tensor_layout::convolution::GKXC> &&
std::is_same_v<OutLay, tensor_layout::convolution::NWGK>,
bool>::type = false>
CK_TILE_HOST GroupedConvFwdKernelArgs(const GroupedConvHostArgs& args)
{
in_g_n_c_wis_lengths = {static_cast<index_t>(args.G_),
static_cast<index_t>(args.N_),
static_cast<index_t>(args.C_),
static_cast<index_t>(args.input_spatial_lengths_[0])};
wei_g_k_c_xs_lengths = {static_cast<index_t>(args.G_),
static_cast<index_t>(args.K_),
static_cast<index_t>(args.C_),
static_cast<index_t>(args.filter_spatial_lengths_[0])};
out_g_n_k_wos_lengths = {static_cast<index_t>(args.G_),
static_cast<index_t>(args.N_),
static_cast<index_t>(args.K_),
static_cast<index_t>(args.output_spatial_lengths_[0])};
conv_filter_strides = {static_cast<index_t>(args.conv_filter_strides_[0])};
conv_filter_dilations = {static_cast<index_t>(args.conv_filter_dilations_[0])};
input_left_pads = {static_cast<index_t>(args.input_left_pads_[0])};
input_right_pads = {static_cast<index_t>(args.input_right_pads_[0])};
k_batch = args.k_batch;
GemmM = args.N_ * args.output_spatial_lengths_[0];
GemmN = args.K_;
GemmK = args.C_ * args.filter_spatial_lengths_[0];
in_ptr = args.in_ptr;
wei_ptr = args.wei_ptr;
out_ptr = args.out_ptr;
ConvToGemmFwdTransformer conv_to_gemm_transformer{in_g_n_c_wis_lengths,
wei_g_k_c_xs_lengths,
out_g_n_k_wos_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads};
a_grid_desc_m_k = conv_to_gemm_transformer.template MakeADescriptor_M_K<InLayout>();
b_grid_desc_n_k = conv_to_gemm_transformer.template MakeBDescriptor_N_K<WeiLayout>();
c_grid_desc_m_n = conv_to_gemm_transformer.template MakeCDescriptor_M_N<OutLayout>();
group_stride_a = args.C_;
group_stride_b = args.K_ * args.C_ *
std::accumulate(args.filter_spatial_lengths_.begin(),
args.filter_spatial_lengths_.end(),
1,
std::multiplies<index_t>());
group_stride_c = args.K_;
}
template <
typename InLay = InLayout,
typename WeiLay = WeiLayout,
typename OutLay = OutLayout,
typename std::enable_if<std::is_same_v<InLay, tensor_layout::convolution::NHWGC> &&
std::is_same_v<WeiLay, tensor_layout::convolution::GKYXC> &&
std::is_same_v<OutLay, tensor_layout::convolution::NHWGK>,
bool>::type = false>
CK_TILE_HOST GroupedConvFwdKernelArgs(const GroupedConvHostArgs& args)
{
in_g_n_c_wis_lengths = {static_cast<index_t>(args.G_),
static_cast<index_t>(args.N_),
static_cast<index_t>(args.C_),
static_cast<index_t>(args.input_spatial_lengths_[0]),
static_cast<index_t>(args.input_spatial_lengths_[1])};
wei_g_k_c_xs_lengths = {static_cast<index_t>(args.G_),
static_cast<index_t>(args.K_),
static_cast<index_t>(args.C_),
static_cast<index_t>(args.filter_spatial_lengths_[0]),
static_cast<index_t>(args.filter_spatial_lengths_[1])};
out_g_n_k_wos_lengths = {static_cast<index_t>(args.G_),
static_cast<index_t>(args.N_),
static_cast<index_t>(args.K_),
static_cast<index_t>(args.output_spatial_lengths_[0]),
static_cast<index_t>(args.output_spatial_lengths_[1])};
conv_filter_strides = {static_cast<index_t>(args.conv_filter_strides_[0]),
static_cast<index_t>(args.conv_filter_strides_[1])};
conv_filter_dilations = {static_cast<index_t>(args.conv_filter_dilations_[0]),
static_cast<index_t>(args.conv_filter_dilations_[1])};
input_left_pads = {static_cast<index_t>(args.input_left_pads_[0]),
static_cast<index_t>(args.input_left_pads_[1])};
input_right_pads = {static_cast<index_t>(args.input_right_pads_[0]),
static_cast<index_t>(args.input_right_pads_[1])};
k_batch = args.k_batch;
GemmM = args.N_ * args.output_spatial_lengths_[0] * args.output_spatial_lengths_[1];
GemmN = args.K_;
GemmK = args.C_ * args.filter_spatial_lengths_[0] * args.filter_spatial_lengths_[1];
in_ptr = args.in_ptr;
wei_ptr = args.wei_ptr;
out_ptr = args.out_ptr;
ConvToGemmFwdTransformer conv_to_gemm_transformer{in_g_n_c_wis_lengths,
wei_g_k_c_xs_lengths,
out_g_n_k_wos_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads};
a_grid_desc_m_k = conv_to_gemm_transformer.template MakeADescriptor_M_K<InLayout>();
b_grid_desc_n_k = conv_to_gemm_transformer.template MakeBDescriptor_N_K<WeiLayout>();
c_grid_desc_m_n = conv_to_gemm_transformer.template MakeCDescriptor_M_N<OutLayout>();
group_stride_a = args.C_;
group_stride_b = args.K_ * args.C_ *
std::accumulate(args.filter_spatial_lengths_.begin(),
args.filter_spatial_lengths_.end(),
1,
std::multiplies<index_t>());
group_stride_c = args.K_;
}
template <
typename InLay = InLayout,
typename WeiLay = WeiLayout,
typename OutLay = OutLayout,
typename std::enable_if<std::is_same_v<InLay, tensor_layout::convolution::NDHWGC> &&
std::is_same_v<WeiLay, tensor_layout::convolution::GKZYXC> &&
std::is_same_v<OutLay, tensor_layout::convolution::NDHWGK>,
bool>::type = false>
CK_TILE_HOST GroupedConvFwdKernelArgs(const GroupedConvHostArgs& args)
{
in_g_n_c_wis_lengths = {static_cast<index_t>(args.G_),
static_cast<index_t>(args.N_),
static_cast<index_t>(args.C_),
static_cast<index_t>(args.input_spatial_lengths_[0]),
static_cast<index_t>(args.input_spatial_lengths_[1]),
static_cast<index_t>(args.input_spatial_lengths_[2])};
wei_g_k_c_xs_lengths = {static_cast<index_t>(args.G_),
static_cast<index_t>(args.K_),
static_cast<index_t>(args.C_),
static_cast<index_t>(args.filter_spatial_lengths_[0]),
static_cast<index_t>(args.filter_spatial_lengths_[1]),
static_cast<index_t>(args.filter_spatial_lengths_[2])};
out_g_n_k_wos_lengths = {static_cast<index_t>(args.G_),
static_cast<index_t>(args.N_),
static_cast<index_t>(args.K_),
static_cast<index_t>(args.output_spatial_lengths_[0]),
static_cast<index_t>(args.output_spatial_lengths_[1]),
static_cast<index_t>(args.output_spatial_lengths_[2])};
conv_filter_strides = {static_cast<index_t>(args.conv_filter_strides_[0]),
static_cast<index_t>(args.conv_filter_strides_[1]),
static_cast<index_t>(args.conv_filter_strides_[2])};
conv_filter_dilations = {static_cast<index_t>(args.conv_filter_dilations_[0]),
static_cast<index_t>(args.conv_filter_dilations_[1]),
static_cast<index_t>(args.conv_filter_dilations_[2])};
input_left_pads = {static_cast<index_t>(args.input_left_pads_[0]),
static_cast<index_t>(args.input_left_pads_[1]),
static_cast<index_t>(args.input_left_pads_[2])};
input_right_pads = {static_cast<index_t>(args.input_right_pads_[0]),
static_cast<index_t>(args.input_right_pads_[1]),
static_cast<index_t>(args.input_right_pads_[2])};
k_batch = args.k_batch;
GemmM = args.N_ * args.output_spatial_lengths_[0] * args.output_spatial_lengths_[1] *
args.output_spatial_lengths_[2];
GemmN = args.K_;
GemmK = args.C_ * args.filter_spatial_lengths_[0] * args.filter_spatial_lengths_[1] *
args.filter_spatial_lengths_[2];
in_ptr = args.in_ptr;
wei_ptr = args.wei_ptr;
out_ptr = args.out_ptr;
ConvToGemmFwdTransformer conv_to_gemm_transformer{in_g_n_c_wis_lengths,
wei_g_k_c_xs_lengths,
out_g_n_k_wos_lengths,
conv_filter_strides,
conv_filter_dilations,
input_left_pads,
input_right_pads};
a_grid_desc_m_k = conv_to_gemm_transformer.template MakeADescriptor_M_K<InLayout>();
b_grid_desc_n_k = conv_to_gemm_transformer.template MakeBDescriptor_N_K<WeiLayout>();
c_grid_desc_m_n = conv_to_gemm_transformer.template MakeCDescriptor_M_N<OutLayout>();
group_stride_a = args.C_;
group_stride_b = args.K_ * args.C_ *
std::accumulate(args.filter_spatial_lengths_.begin(),
args.filter_spatial_lengths_.end(),
1,
std::multiplies<index_t>());
group_stride_c = args.K_;
}
using AGridDescMK = remove_cvref_t<decltype(
ConvToGemmFwdTransformer{}.template MakeADescriptor_M_K<InLayout>())>;
using BGridDescNK = remove_cvref_t<decltype(
ConvToGemmFwdTransformer{}.template MakeBDescriptor_N_K<WeiLayout>())>;
using CGridDescMN = remove_cvref_t<decltype(
ConvToGemmFwdTransformer{}.template MakeCDescriptor_M_N<OutLayout>())>;
static constexpr index_t NonSpatialDims = 3;
array<index_t, NonSpatialDims + NDimSpatial> in_g_n_c_wis_lengths;
array<index_t, NonSpatialDims + NDimSpatial> wei_g_k_c_xs_lengths;
array<index_t, NonSpatialDims + NDimSpatial> out_g_n_k_wos_lengths;
array<index_t, NDimSpatial> conv_filter_strides;
array<index_t, NDimSpatial> conv_filter_dilations;
array<index_t, NDimSpatial> input_left_pads;
array<index_t, NDimSpatial> input_right_pads;
index_t k_batch;
index_t GemmM;
index_t GemmN;
index_t GemmK;
const void* in_ptr;
const void* wei_ptr;
void* out_ptr;
AGridDescMK a_grid_desc_m_k;
BGridDescNK b_grid_desc_n_k;
CGridDescMN c_grid_desc_m_n;
long_index_t group_stride_a;
long_index_t group_stride_b;
long_index_t group_stride_c;
};
/// @brief The Grouped Convolution Forward kernel template.
///
/// @paragraph Overview Overview
/// This class provides the grouped convolution forward kernel template. By semantic
/// division of Implicit GEMM algorithm into following parts we achieve flexible,
/// versatile and robust kernel implementation.
///
/// @li @b Prolog - The start of GEMM kernel implementation in @ref operator()
/// function call operator" which determines the work scope of each workgroup.
/// @li @b GemmPipeline - The core part @a "heart" of matrix multiplication algorithm.
/// This is the place where each workgroup is loading data from global memory and
/// carrying out dot products.
/// @li @b Epilogue - The @a "final" part of matrix multiplication implementation
/// responsible for storing results to global memory. This is also the place where
/// any additional operator fusion may take place.
///
/// Additionally both @ref GemmPipeline_ "GemmPipeline" and @ref EpiloguePipeline_
/// "EpiloguePipeline" are parameterized with so called @a Policy which determines all
/// internal details of those functional parts. You can think of it like both gemm and
/// epilogue pipelines provides the control-flow logic controlled by policies. Moreover
/// the policy is responsible for definition of all necessary data layouts and thread's
/// work distribution.
///
/// @tparam NDimSpatial_ Number of spatial dimensions of input image.
/// @tparam ConvForwardSpecialization_ Tensor descriptors specialization.
/// @tparam TilePartitioner_ The type of class providing mapping of workgroup index into
/// the
/// output data tile to be calculated. It determines the
/// workgroup to data relationship (or in other words - which
/// data would be processed and calculated by which workgroup).
/// @tparam GemmPipeline_ The type of class which provides the core part of matrix
/// multiplication. This class should provide implementation of
/// data loading from global memory and performing block-wise
/// matrix multiplication. You can think of it as a work done by
/// single workgroup point of view.
/// @tparam EpiloguePipeline_ The type of class providing the final part of matrix
/// multiplication implementation. It is responsible for storing
/// results calculated by @ref GemmPipeline_ "GemmPipeline" to
/// the output C tensor in global memory.
template <index_t NDimSpatial_,
ConvolutionForwardSpecialization ConvForwardSpecialization_,
typename InLayout_,
typename WeiLayout_,
typename OutLayout_,
typename TilePartitioner_,
typename GemmPipeline_,
typename EpiloguePipeline_>
struct GroupedConvolutionForwardKernel
{
static constexpr index_t NDimSpatial = NDimSpatial_;
static constexpr ConvolutionForwardSpecialization ConvForwardSpecialization =
ConvForwardSpecialization_;
using TilePartitioner = remove_cvref_t<TilePartitioner_>;
using GemmPipeline = remove_cvref_t<GemmPipeline_>;
using EpiloguePipeline = remove_cvref_t<EpiloguePipeline_>;
using GemmALayout = remove_cvref_t<typename GemmPipeline::ALayout>;
using GemmBLayout = remove_cvref_t<typename GemmPipeline::BLayout>;
using GemmCLayout = remove_cvref_t<typename GemmPipeline::CLayout>;
using InLayout = remove_cvref_t<InLayout_>;
using WeiLayout = remove_cvref_t<WeiLayout_>;
using OutLayout = remove_cvref_t<OutLayout_>;
static constexpr index_t KernelBlockSize = GemmPipeline::BlockSize;
using InDataType = remove_cvref_t<typename GemmPipeline::ADataType>;
using WeiDataType = remove_cvref_t<typename GemmPipeline::BDataType>;
// Below type is actually accumulation data type - the output of block GEMM.
using OutDataType = remove_cvref_t<typename EpiloguePipeline::ODataType>;
using GroupedConvFwdKernelArgsSpecialized = GroupedConvFwdKernelArgs<NDimSpatial_,
ConvForwardSpecialization,
InLayout,
WeiLayout,
OutLayout>;
// TODO: Enable this
static constexpr bool IsSplitKSupported = false;
static constexpr auto I0 = number<0>();
static constexpr auto I1 = number<1>();
static constexpr auto I2 = number<2>();
static constexpr auto I3 = number<3>();
static_assert(GemmPipeline::kPadM && GemmPipeline::kPadN && GemmPipeline::kPadK,
"Not supported!");
static_assert(std::is_same_v<GemmALayout, tensor_layout::gemm::RowMajor>, "Not supported!");
static_assert(std::is_same_v<GemmBLayout, tensor_layout::gemm::ColumnMajor>, "Not supported!");
static_assert(std::is_same_v<GemmCLayout, tensor_layout::gemm::RowMajor>, "Not supported!");
[[nodiscard]] CK_TILE_HOST static const std::string GetName()
{
// clang-format off
return concat('_', "grouped_convolution_forward", gemm_prec_str<InDataType, WeiDataType>, GemmPipeline::GetName());
// clang-format on
}
CK_TILE_HOST static constexpr auto GridSize(const GroupedConvHostArgs& args)
{
const index_t GemmM = args.N_ * std::accumulate(args.output_spatial_lengths_.begin(),
args.output_spatial_lengths_.end(),
1,
std::multiplies<index_t>());
const index_t GemmN = args.K_;
return dim3(TilePartitioner::GridSize(GemmM, GemmN), args.G_, args.k_batch);
}
CK_TILE_HOST static constexpr auto BlockSize() { return dim3(KernelBlockSize); }
CK_TILE_HOST static constexpr GroupedConvFwdKernelArgsSpecialized
MakeKernelArgs(const GroupedConvHostArgs& hostArgs)
{
return GroupedConvFwdKernelArgsSpecialized(hostArgs);
}
CK_TILE_HOST_DEVICE static constexpr index_t GetSmemSize()
{
return max(GemmPipeline::GetSmemSize(), EpiloguePipeline::GetSmemSize());
}
CK_TILE_HOST static bool IsSupportedArgument(const GroupedConvFwdKernelArgsSpecialized& kargs)
{
if constexpr((EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
is_any_of<OutDataType, fp16_t, bf16_t>::value) ||
!IsSplitKSupported)
{
if(kargs.k_batch != 1)
{
if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
{
CK_TILE_ERROR("Conditions not met for Kbatch >1 !");
}
return false;
}
}
const index_t ConvK = kargs.wei_g_k_c_xs_lengths[number<1>{}];
const index_t ConvC = kargs.wei_g_k_c_xs_lengths[number<2>{}];
// check ConvolutionForwardSpecialization
if constexpr(ConvForwardSpecialization ==
ConvolutionForwardSpecialization::Filter1x1Stride1Pad0)
{
// check if it's 1x1, stride=1 conv
for(index_t i = 0; i < NDimSpatial; ++i)
{
const index_t SpatialDim = kargs.wei_g_k_c_xs_lengths[i + 3];
const index_t ConvStride = kargs.conv_filter_strides[i];
const index_t LeftPad = kargs.input_left_pads[i];
const index_t RightPad = kargs.input_right_pads[i];
if(!(SpatialDim == 1 && ConvStride == 1 && LeftPad == 0 && RightPad == 0))
{
return false;
}
}
}
else if constexpr(ConvForwardSpecialization ==
ConvolutionForwardSpecialization::Filter1x1Pad0)
{
// check if it's 1x1 conv
for(index_t i = 0; i < NDimSpatial; ++i)
{
const index_t SpatialDim = kargs.wei_g_k_c_xs_lengths[i + 3];
const index_t LeftPad = kargs.input_left_pads[i];
const index_t RightPad = kargs.input_right_pads[i];
if(!(SpatialDim == 1 && LeftPad == 0 && RightPad == 0))
{
return false;
}
}
}
else if constexpr(ConvForwardSpecialization == ConvolutionForwardSpecialization::Filter3x3)
{
if(ConvC != 1)
{
return false;
}
for(index_t i = 0; i < NDimSpatial; ++i)
{
const index_t filter_spatial_dim = kargs.wei_g_k_c_xs_lengths[i + I3];
if(filter_spatial_dim != I3)
{
return false;
}
}
}
namespace ctc = tensor_layout::convolution;
if constexpr(std::is_same_v<InLayout, ctc::NWGC> || std::is_same_v<InLayout, ctc::NHWGC> ||
std::is_same_v<InLayout, ctc::NDHWGC>)
{
// Check access per C
if(ConvC % GemmPipeline::GetVectorSizeA() != 0)
{
CK_TILE_ERROR("Conv C is not a multiple of vector load size for input image!");
return false;
}
}
else
{
CK_TILE_ERROR("Not supported input layout!");
return false;
}
// check vector access of B
// FIXME: layout
if constexpr(std::is_same_v<WeiLayout, ctc::GKXC> ||
std::is_same_v<WeiLayout, ctc::GKYXC> ||
std::is_same_v<WeiLayout, ctc::GKZYXC>)
{
if(ConvC % GemmPipeline::GetVectorSizeB() != 0)
{
CK_TILE_ERROR("Conv C is not a multiple of vector load size for weight!");
return false;
}
}
else
{
CK_TILE_ERROR("Not supported weight layout!");
return false;
}
// check vector access of E
if constexpr(std::is_same_v<OutLayout, ctc::NWGK> ||
std::is_same_v<OutLayout, ctc::NHWGK> ||
std::is_same_v<OutLayout, ctc::NDHWGK>)
{
if(ConvK % EpiloguePipeline::GetVectorSizeC() != 0)
{
CK_TILE_ERROR("Conv K is not a multiple of vector store size for output image!");
return false;
}
}
else
{
CK_TILE_ERROR("Not supported output layout!");
return false;
}
return true;
}
template <memory_operation_enum DstInMemOp = memory_operation_enum::set>
CK_TILE_DEVICE static auto MakeGemmTensorViews(const InDataType* a_ptr,
const WeiDataType* b_ptr,
OutDataType* c_ptr,
const GroupedConvFwdKernelArgsSpecialized& kargs)
{
static_assert(!TilePartitioner::BlockGemmShape::PermuteA, "Not implemented!");
static_assert(!TilePartitioner::BlockGemmShape::PermuteB, "Not implemented!");
const auto& a_tensor_view = [&]() {
return make_tensor_view<address_space_enum::global>(a_ptr, kargs.a_grid_desc_m_k);
}();
const auto& b_tensor_view = [&]() {
return make_tensor_view<address_space_enum::global>(b_ptr, kargs.b_grid_desc_n_k);
}();
// TODO: enable vector write for C in ColMajor
const auto& c_tensor_view = [&]() {
return make_tensor_view<address_space_enum::global>(c_ptr, kargs.c_grid_desc_m_n);
}();
return make_tuple(a_tensor_view, b_tensor_view, c_tensor_view);
}
template <typename TensorView>
CK_TILE_DEVICE static auto MakeGemmPadViews(const TensorView& views)
{
const auto& a_pad_view = [&]() {
const auto& a_tensor_view = views.at(I0);
return pad_tensor_view(a_tensor_view,
make_tuple(number<TilePartitioner::MPerBlock>{},
number<TilePartitioner::KPerBlock>{}),
sequence<true, true>{});
}();
const auto& b_pad_view = [&]() {
const auto& b_tensor_view = views.at(I1);
return pad_tensor_view(b_tensor_view,
make_tuple(number<TilePartitioner::NPerBlock>{},
number<TilePartitioner::KPerBlock>{}),
sequence<true, true>{});
}();
const auto& c_pad_view = [&]() {
const auto& c_tensor_view = views.at(I2);
return pad_tensor_view(c_tensor_view,
make_tuple(number<TilePartitioner::MPerBlock>{},
number<TilePartitioner::NPerBlock>{}),
sequence<true, true>{});
}();
return make_tuple(a_pad_view, b_pad_view, c_pad_view);
}
template <typename PadView>
CK_TILE_DEVICE static auto
MakeGemmTileWindows(const PadView& views, const index_t i_m, const index_t i_n)
{
const auto& a_pad_view = views.at(I0);
const auto& b_pad_view = views.at(I1);
const auto& c_pad_view = views.at(I2);
const auto& a_block_window = [&]() {
return make_tile_window(a_pad_view,
make_tuple(number<TilePartitioner::MPerBlock>{},
number<TilePartitioner::KPerBlock>{}),
{i_m, 0});
}();
const auto& b_block_window = [&]() {
return make_tile_window(b_pad_view,
make_tuple(number<TilePartitioner::NPerBlock>{},
number<TilePartitioner::KPerBlock>{}),
{i_n, 0});
}();
auto c_block_window = make_tile_window(
c_pad_view,
make_tuple(number<TilePartitioner::MPerBlock>{}, number<TilePartitioner::NPerBlock>{}),
{i_m, i_n});
return make_tuple(a_block_window, b_block_window, c_block_window);
}
/**
* @brief Runs single GEMM problem cooperatively by whole workgroup.
*
* @param a_ptr input A pointer
* @param b_ptr input B pointer
* @param c_ptr output C pointer
* @param smem_ptr_0 The start memory pointer of the shared memory block.
* @param kargs Grouped Convolution Forward kernel arguments
* @param block_idx_m The GEMM's output M dimension tile index processed by this workgroup.
* @param block_idx_n The GEMM's output N dimension tile index processed by this workgroup.
*
*/
CK_TILE_DEVICE static void RunGemm(const InDataType* a_ptr,
const WeiDataType* b_ptr,
OutDataType* c_ptr,
void* smem_ptr_0,
const GroupedConvFwdKernelArgsSpecialized& kargs,
const index_t block_idx_m,
const index_t block_idx_n)
{
// Create Gemm tensor views, pad views and tile windows
const auto& gemm_tensor_views_tuple =
MakeGemmTensorViews<EpiloguePipeline::MemoryOperation>(a_ptr, b_ptr, c_ptr, kargs);
const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
const index_t num_loop =
__builtin_amdgcn_readfirstlane(TilePartitioner::GetLoopNum(kargs.GemmK));
// Run GEMM cooperatively by whole workgroup.
const auto& a_block_window = gemm_tile_windows.at(I0);
const auto& b_block_window = gemm_tile_windows.at(I1);
const auto& c_block_tile = GemmPipeline{}.template operator()(
a_block_window, b_block_window, num_loop, smem_ptr_0);
// Run Epilogue Pipeline
auto& c_block_window = gemm_tile_windows.at(I2);
EpiloguePipeline{}.template operator()<decltype(c_block_window), decltype(c_block_tile)>(
c_block_window, c_block_tile, smem_ptr_0);
}
/**
* @brief Runs single GEMM problem cooperatively by whole workgroup.
*
* @note RunGEMM2LDS in with two shared memory buffers using the ping pong buffer mechanism.
*
* @param a_ptr input A pointer
* @param b_ptr input B pointer
* @param c_ptr output C pointer
* @param smem_ptr_0 The starting pointer of 1st shared memory block.
* @param smem_ptr_1 The starting pointer of 2nd shared memory block.
* @param kargs Grouped Convolution Forward kernel arguments
* @param block_idx_m The GEMM's output M dimension tile index processed by this workgroup.
* @param block_idx_n The GEMM's output N dimension tile index processed by this workgroup.
*
*/
CK_TILE_DEVICE static void RunGemm2LDS(const InDataType* a_ptr,
const WeiDataType* b_ptr,
OutDataType* c_ptr,
void* __restrict__ smem_ptr_0,
void* __restrict__ smem_ptr_1,
const GroupedConvFwdKernelArgsSpecialized& kargs,
const index_t block_idx_m,
const index_t block_idx_n)
{
// Create Gemm tensor views, pad views and tile windows
const auto& gemm_tensor_views_tuple =
MakeGemmTensorViews<EpiloguePipeline::MemoryOperation>(a_ptr, b_ptr, c_ptr, kargs);
const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
const index_t num_loop =
__builtin_amdgcn_readfirstlane(TilePartitioner::GetLoopNum(kargs.GemmK));
// Run GEMM cooperatively by whole workgroup.
const auto& a_block_window = gemm_tile_windows.at(I0);
const auto& b_block_window = gemm_tile_windows.at(I1);
const auto& c_block_tile = GemmPipeline{}.template operator()(
a_block_window, b_block_window, num_loop, smem_ptr_0, smem_ptr_1);
// Run Epilogue Pipeline
auto& c_block_window = gemm_tile_windows.at(I2);
EpiloguePipeline{}.template operator()<decltype(c_block_window), decltype(c_block_tile)>(
c_block_window, c_block_tile, smem_ptr_0);
}
CK_TILE_DEVICE void operator()(GroupedConvFwdKernelArgsSpecialized kargs) const
{
const auto blockIdX = __builtin_amdgcn_readfirstlane(blockIdx.x);
const auto [iM, iN] =
TilePartitioner{kargs.GemmM, kargs.GemmN}.GetOutputTileIndex(blockIdX);
const index_t i_m = __builtin_amdgcn_readfirstlane(iM * TilePartitioner::MPerBlock);
const index_t i_n = __builtin_amdgcn_readfirstlane(iN * TilePartitioner::NPerBlock);
const auto blockIdY = __builtin_amdgcn_readfirstlane(blockIdx.y);
const auto group_offset_a = __builtin_amdgcn_readfirstlane(kargs.group_stride_a * blockIdY);
const auto group_offset_b = __builtin_amdgcn_readfirstlane(kargs.group_stride_b * blockIdY);
const auto group_offset_c = __builtin_amdgcn_readfirstlane(kargs.group_stride_c * blockIdY);
// options
const InDataType* a_ptr = static_cast<const InDataType*>(kargs.in_ptr) + group_offset_a;
const WeiDataType* b_ptr = static_cast<const WeiDataType*>(kargs.wei_ptr) + group_offset_b;
OutDataType* c_ptr = static_cast<OutDataType*>(kargs.out_ptr) + group_offset_c;
// allocate LDS
__shared__ char smem_ptr_0[GetSmemSize()];
if constexpr(GemmPipeline::DoubleSmemBuffer == true)
{
__shared__ char smem_ptr_1[GetSmemSize()];
if constexpr(!(EpiloguePipeline::MemoryOperation == memory_operation_enum::atomic_add &&
EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
is_any_of<OutDataType, fp16_t, bf16_t>::value))
{
RunGemm2LDS(a_ptr, b_ptr, c_ptr, smem_ptr_0, smem_ptr_1, kargs, i_m, i_n);
}
}
else
{
if constexpr(!(EpiloguePipeline::MemoryOperation == memory_operation_enum::atomic_add &&
EpiloguePipeline::GetVectorSizeC() % 2 != 0 &&
is_any_of<OutDataType, fp16_t, bf16_t>::value))
{
RunGemm(a_ptr, b_ptr, c_ptr, smem_ptr_0, kargs, i_m, i_n);
}
}
}
};
} // namespace ck_tile

View File

@@ -0,0 +1,33 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <string>
namespace ck_tile {
enum struct ConvolutionForwardSpecialization
{
Default,
Filter1x1Pad0,
Filter1x1Stride1Pad0,
OddC,
Filter3x3,
};
CK_TILE_HOST std::string
getConvForwardSpecializationString(const ConvolutionForwardSpecialization& s)
{
switch(s)
{
case ConvolutionForwardSpecialization::Default: return "Default";
case ConvolutionForwardSpecialization::Filter1x1Pad0: return "Filter1x1Pad0";
case ConvolutionForwardSpecialization::Filter1x1Stride1Pad0: return "Filter1x1Stride1Pad0";
case ConvolutionForwardSpecialization::OddC: return "OddC";
case ConvolutionForwardSpecialization::Filter3x3: return "Filter3x3";
default: return "Unrecognized specialization!";
}
}
} // namespace ck_tile

View File

@@ -0,0 +1,46 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/host/convolution_parameter.hpp"
namespace ck_tile {
/// @brief The Grouped Conv kernel host arguments.
///
/// @par Overview
/// This structure is passed to Grouped Convolution Kernels when creating kernel
/// arguments object. It contain all necessary information required to
/// build proper kernel argument and launch kernel on GPU.
struct GroupedConvHostArgs : public conv::ConvParam
{
CK_TILE_HOST GroupedConvHostArgs() = delete;
CK_TILE_HOST GroupedConvHostArgs(ConvParam conv_param,
const void* in_ptr_,
const void* wei_ptr_,
void* out_ptr_,
index_t k_batch_)
: conv::ConvParam(conv_param),
in_ptr(in_ptr_),
wei_ptr(wei_ptr_),
out_ptr(out_ptr_),
k_batch(k_batch_)
{
}
const void* in_ptr;
const void* wei_ptr;
void* out_ptr;
index_t k_batch;
};
using GroupedConvImplicitGemmTraits = TileGemmTraits<true,
true,
true,
ck_tile::tensor_layout::gemm::RowMajor,
ck_tile::tensor_layout::gemm::ColumnMajor,
ck_tile::tensor_layout::gemm::RowMajor>;
} // namespace ck_tile

File diff suppressed because it is too large Load Diff