Files
composable_kernel/experimental/grouped_convolution_tile_instances/include/signatures.hpp
Bartłomiej Kocot 067e5e0ca4 [rocm-libraries] ROCm/rocm-libraries#6838 (commit ff7a665)
[CK_TILE] Add depthwise conv2d forward kernel (FP16/FP32) (#6838)

## Motivation

CK currently has no kernel optimized for depthwise convolution
(G=C_in=C_out, C=K=1 per group) and existing generic paths perform
poorly for this workload. This PR adds a dedicated depthwise conv
forward kernel in CK Tile.

## Technical Details

Adds a dedicated depthwise conv2d forward op to CK Tile that performs
direct convolution rather than falling back to the generic GEMM path.
The kernel is templatized by filter size, stride, and data type, and
compiled into ~60 instances covering common configurations (kernel
3/5/7/9, stride 1/2, FP16/FP32). Supports both CDNA (gfx942/gfx950) and
RDNA (gfx1100/gfx1200) architectures.

## Test Plan

- [x] Correctness and performance validated on gfx942, gfx950, and
gfx1100, with ckProfiler `grouped_conv_fwd` as baseline.
- [ ] MI300A (gfx942) and gfx1200 validation.

## Submission Checklist

- [x ] Look over the contributing guidelines at
https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests.
AICK-1137

---------

Co-authored-by: GenDu <Gen.Du@amd.com>
2026-05-15 15:47:55 +02:00

217 lines
13 KiB
C++

// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include <tuple>
#include "../../builder/test/impl/conv_signature_types.hpp"
#include "ck_tile/builder/testing/conv/ck_tile.hpp"
namespace ck_tile::builder::profiling {
namespace ckb = ck_tile::builder;
namespace ckt = ck_tile::builder::test;
constexpr auto SIGNATURE_NHWGC_FP32_FWD =
ckt::ConvSignature{.spatial_dim = 2,
.direction = ckb::ConvDirection::FORWARD,
.data_type = ckb::DataType::FP32,
.accumulation_data_type = ckb::DataType::FP32,
.input = {.config = {.layout = ckb::TensorLayout::NHWGC}},
.weight = {.config = {.layout = ckb::TensorLayout::GKYXC}},
.output = {.config = {.layout = ckb::TensorLayout::NHWGK}}};
constexpr auto SIGNATURE_NHWGC_BF16_FWD =
ckt::ConvSignature{.spatial_dim = 2,
.direction = ckb::ConvDirection::FORWARD,
.data_type = ckb::DataType::BF16,
.accumulation_data_type = ckb::DataType::FP32,
.input = {.config = {.layout = ckb::TensorLayout::NHWGC}},
.weight = {.config = {.layout = ckb::TensorLayout::GKYXC}},
.output = {.config = {.layout = ckb::TensorLayout::NHWGK}}};
constexpr auto SIGNATURE_NHWGC_FP16_FWD =
ckt::ConvSignature{.spatial_dim = 2,
.direction = ckb::ConvDirection::FORWARD,
.data_type = ckb::DataType::FP16,
.accumulation_data_type = ckb::DataType::FP32,
.input = {.config = {.layout = ckb::TensorLayout::NHWGC}},
.weight = {.config = {.layout = ckb::TensorLayout::GKYXC}},
.output = {.config = {.layout = ckb::TensorLayout::NHWGK}}};
constexpr auto SIGNATURE_NDHWGC_FP32_FWD =
ckt::ConvSignature{.spatial_dim = 3,
.direction = ckb::ConvDirection::FORWARD,
.data_type = ckb::DataType::FP32,
.accumulation_data_type = ckb::DataType::FP32,
.input = {.config = {.layout = ckb::TensorLayout::NDHWGC}},
.weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}},
.output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}};
constexpr auto SIGNATURE_NDHWGC_BF16_FWD =
ckt::ConvSignature{.spatial_dim = 3,
.direction = ckb::ConvDirection::FORWARD,
.data_type = ckb::DataType::BF16,
.accumulation_data_type = ckb::DataType::FP32,
.input = {.config = {.layout = ckb::TensorLayout::NDHWGC}},
.weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}},
.output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}};
constexpr auto SIGNATURE_NDHWGC_FP16_FWD =
ckt::ConvSignature{.spatial_dim = 3,
.direction = ckb::ConvDirection::FORWARD,
.data_type = ckb::DataType::FP16,
.accumulation_data_type = ckb::DataType::FP32,
.input = {.config = {.layout = ckb::TensorLayout::NDHWGC}},
.weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}},
.output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}};
/////////////////////////////////////////
// FWD signatures (NGCHW / NGCDHW)
//////////////////////////////////////////
constexpr auto SIGNATURE_NGCHW_FP32_FWD =
ckt::ConvSignature{.spatial_dim = 2,
.direction = ckb::ConvDirection::FORWARD,
.data_type = ckb::DataType::FP32,
.accumulation_data_type = ckb::DataType::FP32,
.input = {.config = {.layout = ckb::TensorLayout::NGCHW}},
.weight = {.config = {.layout = ckb::TensorLayout::GKCYX}},
.output = {.config = {.layout = ckb::TensorLayout::NGKHW}}};
constexpr auto SIGNATURE_NGCHW_FP16_FWD =
ckt::ConvSignature{.spatial_dim = 2,
.direction = ckb::ConvDirection::FORWARD,
.data_type = ckb::DataType::FP16,
.accumulation_data_type = ckb::DataType::FP32,
.input = {.config = {.layout = ckb::TensorLayout::NGCHW}},
.weight = {.config = {.layout = ckb::TensorLayout::GKCYX}},
.output = {.config = {.layout = ckb::TensorLayout::NGKHW}}};
constexpr auto SIGNATURE_NGCHW_BF16_FWD =
ckt::ConvSignature{.spatial_dim = 2,
.direction = ckb::ConvDirection::FORWARD,
.data_type = ckb::DataType::BF16,
.accumulation_data_type = ckb::DataType::FP32,
.input = {.config = {.layout = ckb::TensorLayout::NGCHW}},
.weight = {.config = {.layout = ckb::TensorLayout::GKCYX}},
.output = {.config = {.layout = ckb::TensorLayout::NGKHW}}};
/////////////////////////////////////////
// BWD WEIGHT signatures
//////////////////////////////////////////
constexpr auto SIGNATURE_NHWGC_BF16_BWD_WEIGHT =
ckt::ConvSignature{.spatial_dim = 2,
.direction = ckb::ConvDirection::BACKWARD_WEIGHT,
.data_type = ckb::DataType::BF16,
.accumulation_data_type = ckb::DataType::FP32,
.input = {.config = {.layout = ckb::TensorLayout::NHWGC}},
.weight = {.config = {.layout = ckb::TensorLayout::GKYXC}},
.output = {.config = {.layout = ckb::TensorLayout::NHWGK}}};
constexpr auto SIGNATURE_NHWGC_FP16_BWD_WEIGHT =
ckt::ConvSignature{.spatial_dim = 2,
.direction = ckb::ConvDirection::BACKWARD_WEIGHT,
.data_type = ckb::DataType::FP16,
.accumulation_data_type = ckb::DataType::FP32,
.input = {.config = {.layout = ckb::TensorLayout::NHWGC}},
.weight = {.config = {.layout = ckb::TensorLayout::GKYXC}},
.output = {.config = {.layout = ckb::TensorLayout::NHWGK}}};
constexpr auto SIGNATURE_NHWGC_FP32_BWD_WEIGHT =
ckt::ConvSignature{.spatial_dim = 2,
.direction = ckb::ConvDirection::BACKWARD_WEIGHT,
.data_type = ckb::DataType::FP32,
.accumulation_data_type = ckb::DataType::FP32,
.input = {.config = {.layout = ckb::TensorLayout::NHWGC}},
.weight = {.config = {.layout = ckb::TensorLayout::GKYXC}},
.output = {.config = {.layout = ckb::TensorLayout::NHWGK}}};
constexpr auto SIGNATURE_NDHWGC_BF16_BWD_WEIGHT =
ckt::ConvSignature{.spatial_dim = 3,
.direction = ckb::ConvDirection::BACKWARD_WEIGHT,
.data_type = ckb::DataType::BF16,
.accumulation_data_type = ckb::DataType::FP32,
.input = {.config = {.layout = ckb::TensorLayout::NDHWGC}},
.weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}},
.output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}};
constexpr auto SIGNATURE_NDHWGC_FP16_BWD_WEIGHT =
ckt::ConvSignature{.spatial_dim = 3,
.direction = ckb::ConvDirection::BACKWARD_WEIGHT,
.data_type = ckb::DataType::FP16,
.accumulation_data_type = ckb::DataType::FP32,
.input = {.config = {.layout = ckb::TensorLayout::NDHWGC}},
.weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}},
.output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}};
constexpr auto SIGNATURE_NDHWGC_FP32_BWD_WEIGHT =
ckt::ConvSignature{.spatial_dim = 3,
.direction = ckb::ConvDirection::BACKWARD_WEIGHT,
.data_type = ckb::DataType::FP32,
.accumulation_data_type = ckb::DataType::FP32,
.input = {.config = {.layout = ckb::TensorLayout::NDHWGC}},
.weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}},
.output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}};
/////////////////////////////////////////
// BWD DATA signatures
//////////////////////////////////////////
constexpr auto SIGNATURE_NHWGC_BF16_BWD_DATA =
ckt::ConvSignature{.spatial_dim = 2,
.direction = ckb::ConvDirection::BACKWARD_DATA,
.data_type = ckb::DataType::BF16,
.accumulation_data_type = ckb::DataType::FP32,
.input = {.config = {.layout = ckb::TensorLayout::NHWGC}},
.weight = {.config = {.layout = ckb::TensorLayout::GKYXC}},
.output = {.config = {.layout = ckb::TensorLayout::NHWGK}}};
constexpr auto SIGNATURE_NHWGC_FP16_BWD_DATA =
ckt::ConvSignature{.spatial_dim = 2,
.direction = ckb::ConvDirection::BACKWARD_DATA,
.data_type = ckb::DataType::FP16,
.accumulation_data_type = ckb::DataType::FP32,
.input = {.config = {.layout = ckb::TensorLayout::NHWGC}},
.weight = {.config = {.layout = ckb::TensorLayout::GKYXC}},
.output = {.config = {.layout = ckb::TensorLayout::NHWGK}}};
constexpr auto SIGNATURE_NHWGC_FP32_BWD_DATA =
ckt::ConvSignature{.spatial_dim = 2,
.direction = ckb::ConvDirection::BACKWARD_DATA,
.data_type = ckb::DataType::FP32,
.accumulation_data_type = ckb::DataType::FP32,
.input = {.config = {.layout = ckb::TensorLayout::NHWGC}},
.weight = {.config = {.layout = ckb::TensorLayout::GKYXC}},
.output = {.config = {.layout = ckb::TensorLayout::NHWGK}}};
constexpr auto SIGNATURE_NDHWGC_BF16_BWD_DATA =
ckt::ConvSignature{.spatial_dim = 3,
.direction = ckb::ConvDirection::BACKWARD_DATA,
.data_type = ckb::DataType::BF16,
.accumulation_data_type = ckb::DataType::FP32,
.input = {.config = {.layout = ckb::TensorLayout::NDHWGC}},
.weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}},
.output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}};
constexpr auto SIGNATURE_NDHWGC_FP16_BWD_DATA =
ckt::ConvSignature{.spatial_dim = 3,
.direction = ckb::ConvDirection::BACKWARD_DATA,
.data_type = ckb::DataType::FP16,
.accumulation_data_type = ckb::DataType::FP32,
.input = {.config = {.layout = ckb::TensorLayout::NDHWGC}},
.weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}},
.output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}};
constexpr auto SIGNATURE_NDHWGC_FP32_BWD_DATA =
ckt::ConvSignature{.spatial_dim = 3,
.direction = ckb::ConvDirection::BACKWARD_DATA,
.data_type = ckb::DataType::FP32,
.accumulation_data_type = ckb::DataType::FP32,
.input = {.config = {.layout = ckb::TensorLayout::NDHWGC}},
.weight = {.config = {.layout = ckb::TensorLayout::GKZYXC}},
.output = {.config = {.layout = ckb::TensorLayout::NDHWGK}}};
} // namespace ck_tile::builder::profiling