mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-06-30 03:37:38 +00:00
[CK_BUILDER] ALMIOPEN-522: Testing-specific descriptor initialization
Remove old CK host descriptor helper dependency from CK Builder testing framework and implement testing-owned descriptor computation. Core changes (ALMIOPEN-522): - Remove ck/library/utility/convolution_* includes from conv_fwd.hpp - Add ConvFwdProblem struct (testing-owned conv parameter description) - Add Args::make_conv_problem() (computes output spatial, no old CK dependency) - Rewrite make_input/weight/output_descriptor() with testing-specific stride mapping for all supported layouts (GNHWC, NHWGC, etc.) - Add optional explicit tensor-memory stride API (std::optional fields) - Update conv_fwd_ck.hpp and conv_fwd_reference.hpp to use make_conv_problem() - Remove to_ck_conv_param() method Additional improvements: - Add CK Tile forward EndToEnd support (conv_fwd_ck_tile.hpp + test) - Proves shared Args/descriptor design works across backends - Merge Create + EndToEnd tests into single file (matches old CK pattern) - Fix unit_validation.cpp BF16 initialization for reliable testing Results: - Old CK warnings removed (no more RowMajor/ColumnMajor spam) - All smoke tests pass (5/5) - Old CK EndToEnd passes (XDL vs GPU reference) - CK Tile EndToEnd passes (Tile vs GPU reference) Note: This PR conflicts with #3518 (tile-builder-testing). Both touch conv_fwd.hpp but with different approaches. This implementation directly addresses ALMIOPEN-522 requirements by removing old CK dependency.
This commit is contained in:
@@ -3,8 +3,11 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <array>
|
||||
#include <optional>
|
||||
#include <stdexcept>
|
||||
|
||||
#include "ck_tile/builder/conv_signature_concepts.hpp"
|
||||
#include "ck_tile/builder/factory/helpers/ck/conv_tensor_layout.hpp"
|
||||
#include "ck_tile/builder/factory/helpers/ck/conv_elementwise_op.hpp"
|
||||
#include "ck_tile/builder/testing/testing.hpp"
|
||||
#include "ck_tile/builder/testing/testing_reflect.hpp"
|
||||
@@ -13,8 +16,6 @@
|
||||
#include "ck_tile/builder/testing/tensor_initialization.hpp"
|
||||
#include "ck_tile/builder/testing/tensor_descriptor.hpp"
|
||||
#include "ck_tile/builder/testing/validation.hpp"
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
|
||||
|
||||
/// This file implements common functionality for invoking/testing grouped
|
||||
/// forward convolutions created through the CK Builder API. The main item
|
||||
@@ -49,6 +50,124 @@ struct ConvTensorLengths
|
||||
FilterExtent<SPATIAL_DIM> filter = {}; // X, Y, Z
|
||||
};
|
||||
|
||||
/// @brief Convolution parameters derived from `Args`.
|
||||
///
|
||||
/// This structure contains all runtime convolution parameters needed by:
|
||||
/// - descriptor generation (output spatial sizes)
|
||||
/// - kernel invocation (stride/dilation/pads)
|
||||
///
|
||||
/// It intentionally does NOT depend on old CK utility types (e.g. ConvParam).
|
||||
template <int SPATIAL_DIM>
|
||||
struct ConvFwdProblem
|
||||
{
|
||||
int G = 1;
|
||||
int N = 1;
|
||||
int C = 1;
|
||||
int K = 1;
|
||||
|
||||
std::array<long_index_t, SPATIAL_DIM> input_spatial = {};
|
||||
std::array<long_index_t, SPATIAL_DIM> filter_spatial = {};
|
||||
std::array<long_index_t, SPATIAL_DIM> output_spatial = {};
|
||||
|
||||
std::array<long_index_t, SPATIAL_DIM> conv_strides = {};
|
||||
std::array<long_index_t, SPATIAL_DIM> conv_dilations = {};
|
||||
std::array<long_index_t, SPATIAL_DIM> left_pads = {};
|
||||
std::array<long_index_t, SPATIAL_DIM> right_pads = {};
|
||||
};
|
||||
|
||||
namespace detail {
|
||||
|
||||
/// @brief Calculate memory strides for a tensor with custom dimension ordering.
|
||||
///
|
||||
/// Given tensor dimensions and a memory layout order, compute the stride
|
||||
/// (memory jump size) needed to move by 1 in each dimension.
|
||||
///
|
||||
/// @param lengths Tensor dimensions (e.g., {3, 4} for 3 rows × 4 columns)
|
||||
/// @param outer_to_inner Dimension ordering from outermost to innermost in memory
|
||||
/// @return Strides for each dimension (e.g., {4, 1} for row-major 3×4 tensor)
|
||||
///
|
||||
/// Example: For a 3×4 tensor stored row-major (outer_to_inner = {0, 1}):
|
||||
/// - Moving 1 row down requires jumping 4 positions → stride[0] = 4
|
||||
/// - Moving 1 column right requires jumping 1 position → stride[1] = 1
|
||||
template <size_t RANK>
|
||||
Extent<RANK> make_packed_strides_for_order(const Extent<RANK>& lengths,
|
||||
const std::array<size_t, RANK>& outer_to_inner)
|
||||
{
|
||||
Extent<RANK> strides = {};
|
||||
|
||||
size_t stride = 1; // Innermost dimension always has stride 1
|
||||
for(size_t i = RANK; i > 0; --i)
|
||||
{
|
||||
const auto dim = outer_to_inner[i - 1]; // Get dimension at this position
|
||||
strides[dim] = stride; // Assign current stride
|
||||
stride *= lengths[dim]; // Update stride for next (outer) dimension
|
||||
}
|
||||
|
||||
return strides;
|
||||
}
|
||||
|
||||
template <int SPATIAL_DIM>
|
||||
std::array<long_index_t, SPATIAL_DIM> to_spatial_array(const FilterExtent<SPATIAL_DIM>& extent)
|
||||
{
|
||||
if constexpr(SPATIAL_DIM == 1)
|
||||
{
|
||||
return {static_cast<long_index_t>(extent.width)};
|
||||
}
|
||||
else if constexpr(SPATIAL_DIM == 2)
|
||||
{
|
||||
// CK Builder uses spatial ordering {H, W} for 2D.
|
||||
return {static_cast<long_index_t>(extent.height), static_cast<long_index_t>(extent.width)};
|
||||
}
|
||||
else
|
||||
{
|
||||
// CK Builder uses spatial ordering {D, H, W} for 3D.
|
||||
return {static_cast<long_index_t>(extent.depth),
|
||||
static_cast<long_index_t>(extent.height),
|
||||
static_cast<long_index_t>(extent.width)};
|
||||
}
|
||||
}
|
||||
|
||||
template <int SPATIAL_DIM>
|
||||
std::array<long_index_t, SPATIAL_DIM>
|
||||
compute_output_spatial(const std::array<long_index_t, SPATIAL_DIM>& input_spatial,
|
||||
const std::array<long_index_t, SPATIAL_DIM>& filter_spatial,
|
||||
const std::array<long_index_t, SPATIAL_DIM>& conv_strides,
|
||||
const std::array<long_index_t, SPATIAL_DIM>& conv_dilations,
|
||||
const std::array<long_index_t, SPATIAL_DIM>& left_pads,
|
||||
const std::array<long_index_t, SPATIAL_DIM>& right_pads)
|
||||
{
|
||||
std::array<long_index_t, SPATIAL_DIM> output_spatial = {};
|
||||
|
||||
for(int i = 0; i < SPATIAL_DIM; ++i)
|
||||
{
|
||||
const auto in = input_spatial[i];
|
||||
const auto fil = filter_spatial[i];
|
||||
const auto s = conv_strides[i];
|
||||
const auto d = conv_dilations[i];
|
||||
const auto pl = left_pads[i];
|
||||
const auto pr = right_pads[i];
|
||||
|
||||
// effective_filter = dilation*(filter-1) + 1
|
||||
const auto effective_filter = d * (fil - 1) + 1;
|
||||
const auto numerator = in + pl + pr - effective_filter;
|
||||
|
||||
if(s <= 0)
|
||||
{
|
||||
throw std::runtime_error("invalid convolution stride (must be > 0)");
|
||||
}
|
||||
if(numerator < 0)
|
||||
{
|
||||
throw std::runtime_error("invalid convolution parameters (negative output spatial)");
|
||||
}
|
||||
|
||||
output_spatial[i] = numerator / s + 1;
|
||||
}
|
||||
|
||||
return output_spatial;
|
||||
}
|
||||
|
||||
} // namespace detail
|
||||
|
||||
/// @brief `Args` specialization for forward convolution.
|
||||
///
|
||||
/// @tparam SIGNATURE Forward convolution signature.
|
||||
@@ -74,15 +193,16 @@ struct Args<SIGNATURE>
|
||||
// TODO: We shouldn't need to call into an internal namespace here.
|
||||
using Ops = factory::internal::ConvElementwiseOps<SIGNATURE>;
|
||||
|
||||
// TODO: We shouldn't need to call into an internal namespace here.
|
||||
using Layouts = factory::internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
|
||||
|
||||
ConvTensorLengths<SPATIAL_DIM> lengths;
|
||||
|
||||
// TODO: Tensor strides. This needs a new structure as well as some
|
||||
// reworking of the make_*_descriptor() functions, as the current
|
||||
// implementation (based on ConvParam in old CK / CK Tile) does not
|
||||
// support strides at all.
|
||||
// Optional explicit tensor-memory strides (in elements), for custom/non-packed tensors.
|
||||
// When not set, packed strides are derived automatically from the selected TensorLayout.
|
||||
// NOTE: These have explicit default initializers to avoid
|
||||
// -Wmissing-designated-field-initializers when `Args` is aggregate-initialized
|
||||
// using designated initializers in tests.
|
||||
std::optional<typename InputDescriptor::Extent> input_strides = std::nullopt;
|
||||
std::optional<typename WeightDescriptor::Extent> weight_strides = std::nullopt;
|
||||
std::optional<typename OutputDescriptor::Extent> output_strides = std::nullopt;
|
||||
|
||||
FilterExtent<SPATIAL_DIM> filter_strides;
|
||||
FilterExtent<SPATIAL_DIM> filter_dilation;
|
||||
@@ -93,23 +213,115 @@ struct Args<SIGNATURE>
|
||||
Ops::WeiElementwiseOp b_elementwise_op;
|
||||
Ops::OutElementwiseOp cde_elementwise_op;
|
||||
|
||||
/// @brief Convert `Args` into a testing-owned forward convolution problem description.
|
||||
ConvFwdProblem<SPATIAL_DIM> make_conv_problem() const
|
||||
{
|
||||
ConvFwdProblem<SPATIAL_DIM> problem;
|
||||
|
||||
problem.G = static_cast<int>(this->lengths.groups);
|
||||
problem.N = static_cast<int>(this->lengths.batch_size);
|
||||
problem.C = static_cast<int>(this->lengths.input_channels);
|
||||
problem.K = static_cast<int>(this->lengths.output_channels);
|
||||
|
||||
problem.input_spatial = detail::to_spatial_array<SPATIAL_DIM>(this->lengths.image);
|
||||
problem.filter_spatial = detail::to_spatial_array<SPATIAL_DIM>(this->lengths.filter);
|
||||
|
||||
problem.conv_strides = detail::to_spatial_array<SPATIAL_DIM>(this->filter_strides);
|
||||
problem.conv_dilations = detail::to_spatial_array<SPATIAL_DIM>(this->filter_dilation);
|
||||
problem.left_pads = detail::to_spatial_array<SPATIAL_DIM>(this->input_left_pad);
|
||||
problem.right_pads = detail::to_spatial_array<SPATIAL_DIM>(this->input_right_pad);
|
||||
|
||||
problem.output_spatial = detail::compute_output_spatial<SPATIAL_DIM>(problem.input_spatial,
|
||||
problem.filter_spatial,
|
||||
problem.conv_strides,
|
||||
problem.conv_dilations,
|
||||
problem.left_pads,
|
||||
problem.right_pads);
|
||||
|
||||
return problem;
|
||||
}
|
||||
|
||||
/// This function returns the `TensorDescriptor` corresponding to
|
||||
/// the input-tensor of the convolution problem. This can then
|
||||
/// be used to, for example, allocate memory.
|
||||
InputDescriptor make_input_descriptor() const
|
||||
{
|
||||
// TODO: We're using old CK functionality to compute the right
|
||||
// values here, mainly because CK tile does not support the
|
||||
// right tensor layouts here. We should probably change that
|
||||
// because CK currently prints an annoying message about it,
|
||||
// plus that would let us get rid of the `to_ck_conv_param()`
|
||||
// function.
|
||||
const auto param = to_ck_conv_param();
|
||||
const auto desc = ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<
|
||||
typename Layouts::InLayout>(param);
|
||||
using Extent = typename InputDescriptor::Extent;
|
||||
return InputDescriptor(Extent::from_vector(desc.GetLengths()),
|
||||
Extent::from_vector(desc.GetStrides()));
|
||||
Extent lens = {};
|
||||
|
||||
lens[0] = this->lengths.groups;
|
||||
lens[1] = this->lengths.batch_size;
|
||||
lens[2] = this->lengths.input_channels;
|
||||
if constexpr(SPATIAL_DIM == 1)
|
||||
{
|
||||
lens[3] = this->lengths.image.width;
|
||||
}
|
||||
else if constexpr(SPATIAL_DIM == 2)
|
||||
{
|
||||
lens[3] = this->lengths.image.height;
|
||||
lens[4] = this->lengths.image.width;
|
||||
}
|
||||
else
|
||||
{
|
||||
lens[3] = this->lengths.image.depth;
|
||||
lens[4] = this->lengths.image.height;
|
||||
lens[5] = this->lengths.image.width;
|
||||
}
|
||||
|
||||
const auto make_default_strides = [&] {
|
||||
constexpr auto layout = SIGNATURE.input.config.layout;
|
||||
|
||||
if constexpr(SPATIAL_DIM == 1)
|
||||
{
|
||||
if constexpr(layout == TensorLayout::GNCW)
|
||||
return detail::make_packed_strides_for_order<4>(lens, {0, 1, 2, 3});
|
||||
else if constexpr(layout == TensorLayout::GNWC ||
|
||||
layout == TensorLayout::G_NW_C_strided)
|
||||
return detail::make_packed_strides_for_order<4>(lens, {0, 1, 3, 2});
|
||||
else if constexpr(layout == TensorLayout::NWGC)
|
||||
return detail::make_packed_strides_for_order<4>(lens, {1, 3, 0, 2});
|
||||
else if constexpr(layout == TensorLayout::NGCW)
|
||||
return detail::make_packed_strides_for_order<4>(lens, {1, 0, 2, 3});
|
||||
else
|
||||
static_assert(sizeof(UnsupportedEnumValue<layout>) == 0,
|
||||
"Unsupported 1D input layout for descriptor initialization.");
|
||||
}
|
||||
else if constexpr(SPATIAL_DIM == 2)
|
||||
{
|
||||
if constexpr(layout == TensorLayout::GNCHW)
|
||||
return detail::make_packed_strides_for_order<5>(lens, {0, 1, 2, 3, 4});
|
||||
else if constexpr(layout == TensorLayout::GNHWC ||
|
||||
layout == TensorLayout::G_NHW_C_strided)
|
||||
return detail::make_packed_strides_for_order<5>(lens, {0, 1, 3, 4, 2});
|
||||
else if constexpr(layout == TensorLayout::NHWGC)
|
||||
return detail::make_packed_strides_for_order<5>(lens, {1, 3, 4, 0, 2});
|
||||
else if constexpr(layout == TensorLayout::NGCHW)
|
||||
return detail::make_packed_strides_for_order<5>(lens, {1, 0, 2, 3, 4});
|
||||
else
|
||||
static_assert(sizeof(UnsupportedEnumValue<layout>) == 0,
|
||||
"Unsupported 2D input layout for descriptor initialization.");
|
||||
}
|
||||
else
|
||||
{
|
||||
if constexpr(layout == TensorLayout::GNCDHW)
|
||||
return detail::make_packed_strides_for_order<6>(lens, {0, 1, 2, 3, 4, 5});
|
||||
else if constexpr(layout == TensorLayout::GNDHWC ||
|
||||
layout == TensorLayout::G_NDHW_C_strided)
|
||||
return detail::make_packed_strides_for_order<6>(lens, {0, 1, 3, 4, 5, 2});
|
||||
else if constexpr(layout == TensorLayout::NDHWGC)
|
||||
return detail::make_packed_strides_for_order<6>(lens, {1, 3, 4, 5, 0, 2});
|
||||
else if constexpr(layout == TensorLayout::NGCDHW)
|
||||
return detail::make_packed_strides_for_order<6>(lens, {1, 0, 2, 3, 4, 5});
|
||||
else
|
||||
static_assert(sizeof(UnsupportedEnumValue<layout>) == 0,
|
||||
"Unsupported 3D input layout for descriptor initialization.");
|
||||
}
|
||||
};
|
||||
|
||||
const Extent strides =
|
||||
this->input_strides.has_value() ? *this->input_strides : make_default_strides();
|
||||
|
||||
return InputDescriptor(lens, strides);
|
||||
}
|
||||
|
||||
/// This function returns the `TensorDescriptor` corresponding to
|
||||
@@ -117,13 +329,76 @@ struct Args<SIGNATURE>
|
||||
/// be used to, for example, allocate memory.
|
||||
WeightDescriptor make_weight_descriptor() const
|
||||
{
|
||||
// See note in implementation of `make_input_descriptor`.
|
||||
const auto param = to_ck_conv_param();
|
||||
const auto desc = ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<
|
||||
typename Layouts::WeiLayout>(param);
|
||||
using Extent = typename WeightDescriptor::Extent;
|
||||
return WeightDescriptor(Extent::from_vector(desc.GetLengths()),
|
||||
Extent::from_vector(desc.GetStrides()));
|
||||
Extent lens = {};
|
||||
|
||||
lens[0] = this->lengths.groups;
|
||||
lens[1] = this->lengths.output_channels;
|
||||
lens[2] = this->lengths.input_channels;
|
||||
if constexpr(SPATIAL_DIM == 1)
|
||||
{
|
||||
lens[3] = this->lengths.filter.width;
|
||||
}
|
||||
else if constexpr(SPATIAL_DIM == 2)
|
||||
{
|
||||
lens[3] = this->lengths.filter.height;
|
||||
lens[4] = this->lengths.filter.width;
|
||||
}
|
||||
else
|
||||
{
|
||||
lens[3] = this->lengths.filter.depth;
|
||||
lens[4] = this->lengths.filter.height;
|
||||
lens[5] = this->lengths.filter.width;
|
||||
}
|
||||
|
||||
const auto make_default_strides = [&] {
|
||||
constexpr auto layout = SIGNATURE.weight.config.layout;
|
||||
|
||||
if constexpr(SPATIAL_DIM == 1)
|
||||
{
|
||||
if constexpr(layout == TensorLayout::GKCX)
|
||||
return detail::make_packed_strides_for_order<4>(lens, {0, 1, 2, 3});
|
||||
else if constexpr(layout == TensorLayout::GKXC ||
|
||||
layout == TensorLayout::G_K_X_C_strided)
|
||||
return detail::make_packed_strides_for_order<4>(lens, {0, 1, 3, 2});
|
||||
else if constexpr(layout == TensorLayout::KXGC)
|
||||
return detail::make_packed_strides_for_order<4>(lens, {1, 3, 0, 2});
|
||||
else
|
||||
static_assert(sizeof(UnsupportedEnumValue<layout>) == 0,
|
||||
"Unsupported 1D weight layout for descriptor initialization.");
|
||||
}
|
||||
else if constexpr(SPATIAL_DIM == 2)
|
||||
{
|
||||
if constexpr(layout == TensorLayout::GKCYX)
|
||||
return detail::make_packed_strides_for_order<5>(lens, {0, 1, 2, 3, 4});
|
||||
else if constexpr(layout == TensorLayout::GKYXC ||
|
||||
layout == TensorLayout::G_K_YX_C_strided)
|
||||
return detail::make_packed_strides_for_order<5>(lens, {0, 1, 3, 4, 2});
|
||||
else if constexpr(layout == TensorLayout::KYXGC)
|
||||
return detail::make_packed_strides_for_order<5>(lens, {1, 3, 4, 0, 2});
|
||||
else
|
||||
static_assert(sizeof(UnsupportedEnumValue<layout>) == 0,
|
||||
"Unsupported 2D weight layout for descriptor initialization.");
|
||||
}
|
||||
else
|
||||
{
|
||||
if constexpr(layout == TensorLayout::GKCZYX)
|
||||
return detail::make_packed_strides_for_order<6>(lens, {0, 1, 2, 3, 4, 5});
|
||||
else if constexpr(layout == TensorLayout::GKZYXC ||
|
||||
layout == TensorLayout::G_K_ZYX_C_strided)
|
||||
return detail::make_packed_strides_for_order<6>(lens, {0, 1, 3, 4, 5, 2});
|
||||
else if constexpr(layout == TensorLayout::KZYXGC)
|
||||
return detail::make_packed_strides_for_order<6>(lens, {1, 3, 4, 5, 0, 2});
|
||||
else
|
||||
static_assert(sizeof(UnsupportedEnumValue<layout>) == 0,
|
||||
"Unsupported 3D weight layout for descriptor initialization.");
|
||||
}
|
||||
};
|
||||
|
||||
const Extent strides =
|
||||
this->weight_strides.has_value() ? *this->weight_strides : make_default_strides();
|
||||
|
||||
return WeightDescriptor(lens, strides);
|
||||
}
|
||||
|
||||
/// This function returns the `TensorDescriptor` corresponding to
|
||||
@@ -131,43 +406,84 @@ struct Args<SIGNATURE>
|
||||
/// be used to, for example, allocate memory.
|
||||
OutputDescriptor make_output_descriptor() const
|
||||
{
|
||||
// See note in implementation of `make_input_descriptor`.
|
||||
const auto param = to_ck_conv_param();
|
||||
const auto desc = ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<
|
||||
typename Layouts::OutLayout>(param);
|
||||
using Extent = typename OutputDescriptor::Extent;
|
||||
return OutputDescriptor(Extent::from_vector(desc.GetLengths()),
|
||||
Extent::from_vector(desc.GetStrides()));
|
||||
}
|
||||
Extent lens = {};
|
||||
|
||||
const auto problem = make_conv_problem();
|
||||
|
||||
lens[0] = this->lengths.groups;
|
||||
lens[1] = this->lengths.batch_size;
|
||||
lens[2] = this->lengths.output_channels;
|
||||
if constexpr(SPATIAL_DIM == 1)
|
||||
{
|
||||
lens[3] = static_cast<size_t>(problem.output_spatial[0]);
|
||||
}
|
||||
else if constexpr(SPATIAL_DIM == 2)
|
||||
{
|
||||
lens[3] = static_cast<size_t>(problem.output_spatial[0]);
|
||||
lens[4] = static_cast<size_t>(problem.output_spatial[1]);
|
||||
}
|
||||
else
|
||||
{
|
||||
lens[3] = static_cast<size_t>(problem.output_spatial[0]);
|
||||
lens[4] = static_cast<size_t>(problem.output_spatial[1]);
|
||||
lens[5] = static_cast<size_t>(problem.output_spatial[2]);
|
||||
}
|
||||
|
||||
const auto make_default_strides = [&] {
|
||||
constexpr auto layout = SIGNATURE.output.config.layout;
|
||||
|
||||
/// Convert the Args structure into a CK conv_param structure. This
|
||||
/// function is mainly used to be able to use the existing
|
||||
/// CK-functionality to obtain tensor descriptors.
|
||||
ck::utils::conv::ConvParam to_ck_conv_param() const
|
||||
{
|
||||
const auto to_vector = [](const auto& extent) {
|
||||
if constexpr(SPATIAL_DIM == 1)
|
||||
return std::vector<ck::index_t>{ck::index_t(extent.width)};
|
||||
{
|
||||
if constexpr(layout == TensorLayout::GNKW)
|
||||
return detail::make_packed_strides_for_order<4>(lens, {0, 1, 2, 3});
|
||||
else if constexpr(layout == TensorLayout::GNWK ||
|
||||
layout == TensorLayout::G_NW_K_strided)
|
||||
return detail::make_packed_strides_for_order<4>(lens, {0, 1, 3, 2});
|
||||
else if constexpr(layout == TensorLayout::NWGK)
|
||||
return detail::make_packed_strides_for_order<4>(lens, {1, 3, 0, 2});
|
||||
else if constexpr(layout == TensorLayout::NGKW)
|
||||
return detail::make_packed_strides_for_order<4>(lens, {1, 0, 2, 3});
|
||||
else
|
||||
static_assert(sizeof(UnsupportedEnumValue<layout>) == 0,
|
||||
"Unsupported 1D output layout for descriptor initialization.");
|
||||
}
|
||||
else if constexpr(SPATIAL_DIM == 2)
|
||||
return std::vector<ck::index_t>{ck::index_t(extent.height),
|
||||
ck::index_t(extent.width)};
|
||||
{
|
||||
if constexpr(layout == TensorLayout::GNKHW)
|
||||
return detail::make_packed_strides_for_order<5>(lens, {0, 1, 2, 3, 4});
|
||||
else if constexpr(layout == TensorLayout::GNHWK ||
|
||||
layout == TensorLayout::G_NHW_K_strided)
|
||||
return detail::make_packed_strides_for_order<5>(lens, {0, 1, 3, 4, 2});
|
||||
else if constexpr(layout == TensorLayout::NHWGK)
|
||||
return detail::make_packed_strides_for_order<5>(lens, {1, 3, 4, 0, 2});
|
||||
else if constexpr(layout == TensorLayout::NGKHW)
|
||||
return detail::make_packed_strides_for_order<5>(lens, {1, 0, 2, 3, 4});
|
||||
else
|
||||
static_assert(sizeof(UnsupportedEnumValue<layout>) == 0,
|
||||
"Unsupported 2D output layout for descriptor initialization.");
|
||||
}
|
||||
else
|
||||
return std::vector<ck::index_t>{ck::index_t(extent.depth),
|
||||
ck::index_t(extent.height),
|
||||
ck::index_t(extent.width)};
|
||||
{
|
||||
if constexpr(layout == TensorLayout::GNKDHW)
|
||||
return detail::make_packed_strides_for_order<6>(lens, {0, 1, 2, 3, 4, 5});
|
||||
else if constexpr(layout == TensorLayout::GNDHWK ||
|
||||
layout == TensorLayout::G_NDHW_K_strided)
|
||||
return detail::make_packed_strides_for_order<6>(lens, {0, 1, 3, 4, 5, 2});
|
||||
else if constexpr(layout == TensorLayout::NDHWGK)
|
||||
return detail::make_packed_strides_for_order<6>(lens, {1, 3, 4, 5, 0, 2});
|
||||
else if constexpr(layout == TensorLayout::NGKDHW)
|
||||
return detail::make_packed_strides_for_order<6>(lens, {1, 0, 2, 3, 4, 5});
|
||||
else
|
||||
static_assert(sizeof(UnsupportedEnumValue<layout>) == 0,
|
||||
"Unsupported 3D output layout for descriptor initialization.");
|
||||
}
|
||||
};
|
||||
|
||||
return ck::utils::conv::ConvParam(SPATIAL_DIM,
|
||||
this->lengths.groups,
|
||||
this->lengths.batch_size,
|
||||
this->lengths.output_channels,
|
||||
this->lengths.input_channels,
|
||||
to_vector(this->lengths.filter),
|
||||
to_vector(this->lengths.image),
|
||||
to_vector(this->filter_strides),
|
||||
to_vector(this->filter_dilation),
|
||||
to_vector(this->input_left_pad),
|
||||
to_vector(this->input_right_pad));
|
||||
const Extent strides =
|
||||
this->output_strides.has_value() ? *this->output_strides : make_default_strides();
|
||||
|
||||
return OutputDescriptor(lens, strides);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
@@ -116,7 +116,7 @@ void run(CkConvInstance<SIGNATURE> auto& conv,
|
||||
return result;
|
||||
};
|
||||
|
||||
const auto param = args.to_ck_conv_param();
|
||||
const auto problem = args.make_conv_problem();
|
||||
|
||||
const auto input_desc = args.make_input_descriptor();
|
||||
const auto weight_desc = args.make_weight_descriptor();
|
||||
@@ -134,10 +134,10 @@ void run(CkConvInstance<SIGNATURE> auto& conv,
|
||||
{},
|
||||
to_ck_lengths(output_desc.get_lengths()),
|
||||
to_ck_lengths(output_desc.get_strides()),
|
||||
to_ck_extent(param.conv_filter_strides_),
|
||||
to_ck_extent(param.conv_filter_dilations_),
|
||||
to_ck_extent(param.input_left_pads_),
|
||||
to_ck_extent(param.input_right_pads_),
|
||||
to_ck_extent(problem.conv_strides),
|
||||
to_ck_extent(problem.conv_dilations),
|
||||
to_ck_extent(problem.left_pads),
|
||||
to_ck_extent(problem.right_pads),
|
||||
args.a_elementwise_op,
|
||||
args.b_elementwise_op,
|
||||
args.cde_elementwise_op);
|
||||
|
||||
@@ -0,0 +1,110 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/builder/testing/conv_fwd.hpp"
|
||||
#include "ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp"
|
||||
#include "ck_tile/host/kernel_launch.hpp"
|
||||
#include <stdexcept>
|
||||
#include <type_traits>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
/// This file contains the implementation details for invoking/testing
|
||||
/// grouped forward convolution operations using CK Tile kernels.
|
||||
/// The main item is the `run()` function, which is used to invoke the
|
||||
/// CK Tile grouped convolution forward kernel instances.
|
||||
|
||||
namespace ck_tile::builder::test {
|
||||
|
||||
template <typename Conv, auto SIGNATURE>
|
||||
concept TileConvInstance = requires {
|
||||
typename Conv::CDElementwise;
|
||||
typename Conv::GroupedConvFwdKernelArgsSpecialized;
|
||||
{
|
||||
Conv::MakeKernelArgs(
|
||||
std::declval<ck_tile::GroupedConvFwdHostArgs<typename Conv::CDElementwise>>())
|
||||
} -> std::convertible_to<typename Conv::GroupedConvFwdKernelArgsSpecialized>;
|
||||
};
|
||||
|
||||
/// @brief `run()` specialization for forward convolution and CK Tile.
|
||||
///
|
||||
/// @tparam SIGNATURE Forward convolution signature.
|
||||
/// @throws std::runtime_error if the arguments weren't actually valid for the
|
||||
/// operation. This should be caught and reported by the testing framework.
|
||||
///
|
||||
/// @see run()
|
||||
template <auto SIGNATURE>
|
||||
requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE>
|
||||
void run(TileConvInstance<SIGNATURE> auto& conv,
|
||||
const Args<SIGNATURE>& args,
|
||||
const Inputs<SIGNATURE>& inputs,
|
||||
const Outputs<SIGNATURE>& outputs)
|
||||
{
|
||||
constexpr auto spatial_dim = SIGNATURE.spatial_dim;
|
||||
|
||||
// For now, CK Tile EndToEnd only supports packed tensors.
|
||||
// (Explicit custom strides are added as API in Args, but tile kernels
|
||||
// are not yet wired for that in the testing framework.)
|
||||
if(!args.make_input_descriptor().is_packed())
|
||||
throw std::runtime_error("TODO: Support non-packed input tensor in CK Tile runner");
|
||||
if(!args.make_weight_descriptor().is_packed())
|
||||
throw std::runtime_error("TODO: Support non-packed weight tensor in CK Tile runner");
|
||||
if(!args.make_output_descriptor().is_packed())
|
||||
throw std::runtime_error("TODO: Support non-packed output tensor in CK Tile runner");
|
||||
|
||||
const auto problem = args.make_conv_problem();
|
||||
|
||||
const std::vector<ck_tile::long_index_t> input_spatial(problem.input_spatial.begin(),
|
||||
problem.input_spatial.end());
|
||||
const std::vector<ck_tile::long_index_t> filter_spatial(problem.filter_spatial.begin(),
|
||||
problem.filter_spatial.end());
|
||||
const std::vector<ck_tile::long_index_t> conv_strides(problem.conv_strides.begin(),
|
||||
problem.conv_strides.end());
|
||||
const std::vector<ck_tile::long_index_t> conv_dilations(problem.conv_dilations.begin(),
|
||||
problem.conv_dilations.end());
|
||||
const std::vector<ck_tile::long_index_t> left_pads(problem.left_pads.begin(),
|
||||
problem.left_pads.end());
|
||||
const std::vector<ck_tile::long_index_t> right_pads(problem.right_pads.begin(),
|
||||
problem.right_pads.end());
|
||||
|
||||
// CK Tile host args are built around ck_tile::conv::ConvParam.
|
||||
ck_tile::conv::ConvParam conv_param(static_cast<ck_tile::long_index_t>(spatial_dim),
|
||||
problem.G,
|
||||
problem.N,
|
||||
problem.K,
|
||||
problem.C,
|
||||
filter_spatial,
|
||||
input_spatial,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
left_pads,
|
||||
right_pads);
|
||||
|
||||
using Kernel = std::remove_cvref_t<decltype(conv)>;
|
||||
using CDElementwise = typename Kernel::CDElementwise;
|
||||
|
||||
ck_tile::GroupedConvFwdHostArgs<CDElementwise> host_args(conv_param,
|
||||
inputs.input,
|
||||
inputs.weight,
|
||||
{},
|
||||
outputs.output,
|
||||
/*k_batch=*/1,
|
||||
CDElementwise{});
|
||||
|
||||
auto kargs = Kernel::MakeKernelArgs(host_args);
|
||||
|
||||
if(!Kernel::IsSupportedArgument(kargs))
|
||||
{
|
||||
throw std::runtime_error("invalid argument");
|
||||
}
|
||||
|
||||
const dim3 grids = Kernel::GridSize(kargs);
|
||||
const dim3 blocks = Kernel::BlockSize();
|
||||
|
||||
(void)ck_tile::launch_kernel(ck_tile::stream_config{},
|
||||
ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs));
|
||||
}
|
||||
|
||||
} // namespace ck_tile::builder::test
|
||||
@@ -74,16 +74,14 @@ void run(RefConvInstance<SIGNATURE> auto& conv,
|
||||
const Inputs<SIGNATURE>& inputs,
|
||||
const Outputs<SIGNATURE>& outputs)
|
||||
{
|
||||
// We don't want to compute the output dims manually, just get
|
||||
// them via the existing infrastructure
|
||||
const auto param = args.to_ck_conv_param();
|
||||
const auto problem = args.make_conv_problem();
|
||||
|
||||
// TODO: The reference convolution is currently missing a few features.
|
||||
// Just throw for now, but regard these as TODO items that should be resolved
|
||||
// eventually.
|
||||
|
||||
// Right pads are not supported right now for some reason.
|
||||
for(auto right_pad : param.input_right_pads_)
|
||||
for(auto right_pad : problem.right_pads)
|
||||
{
|
||||
if(right_pad != 0)
|
||||
throw std::runtime_error("TODO: Support right pad in reference conv");
|
||||
@@ -96,19 +94,31 @@ void run(RefConvInstance<SIGNATURE> auto& conv,
|
||||
if(!args.make_output_descriptor().is_packed())
|
||||
throw std::runtime_error("TODO: Support non-packed output tensor in reference conv");
|
||||
|
||||
const std::vector<long_index_t> input_spatial(problem.input_spatial.begin(),
|
||||
problem.input_spatial.end());
|
||||
const std::vector<long_index_t> filter_spatial(problem.filter_spatial.begin(),
|
||||
problem.filter_spatial.end());
|
||||
const std::vector<long_index_t> output_spatial(problem.output_spatial.begin(),
|
||||
problem.output_spatial.end());
|
||||
const std::vector<long_index_t> conv_strides(problem.conv_strides.begin(),
|
||||
problem.conv_strides.end());
|
||||
const std::vector<long_index_t> conv_dilations(problem.conv_dilations.begin(),
|
||||
problem.conv_dilations.end());
|
||||
const std::vector<long_index_t> left_pads(problem.left_pads.begin(), problem.left_pads.end());
|
||||
|
||||
conv.Run(inputs.input,
|
||||
inputs.weight,
|
||||
outputs.output,
|
||||
param.G_,
|
||||
param.N_,
|
||||
param.K_,
|
||||
param.C_,
|
||||
param.input_spatial_lengths_,
|
||||
param.filter_spatial_lengths_,
|
||||
param.output_spatial_lengths_,
|
||||
param.conv_filter_strides_,
|
||||
param.conv_filter_dilations_,
|
||||
param.input_left_pads_);
|
||||
problem.G,
|
||||
problem.N,
|
||||
problem.K,
|
||||
problem.C,
|
||||
input_spatial,
|
||||
filter_spatial,
|
||||
output_spatial,
|
||||
conv_strides,
|
||||
conv_dilations,
|
||||
left_pads);
|
||||
}
|
||||
|
||||
} // namespace ck_tile::builder::test
|
||||
|
||||
@@ -3,33 +3,46 @@
|
||||
|
||||
#include "utils/ckb_conv_tile_test_configs.hpp"
|
||||
#include "utils/ckb_conv_test_utils.hpp"
|
||||
#include "ck_tile/builder/testing/conv_fwd_ck_tile.hpp"
|
||||
#include "ck_tile/builder/testing/conv_fwd_reference.hpp"
|
||||
#include "ck_tile/host/device_prop.hpp"
|
||||
#include "testing_utils.hpp"
|
||||
|
||||
namespace {
|
||||
namespace ckb = ck_tile::builder;
|
||||
namespace ckt = ck_tile::builder::test;
|
||||
namespace cku = ck_tile::builder::test_utils;
|
||||
|
||||
using namespace ck_tile::builder::test_utils;
|
||||
using ck_tile::test::MatchesReference;
|
||||
|
||||
TEST(FwdConvInstances, Create_ConvAlgorithm_Tile_GroupedConvolutionKernel_2D_FP16_NHWGC)
|
||||
constexpr auto SIGNATURE =
|
||||
ckt::ConvSignature{.spatial_dim = 2,
|
||||
.direction = ckb::ConvDirection::FORWARD,
|
||||
.data_type = ckb::DataType::FP16,
|
||||
.accumulation_data_type = ckb::DataType::FP32,
|
||||
.input = {.config = {.layout = ckb::TensorLayout::NHWGC}},
|
||||
.weight = {.config = {.layout = ckb::TensorLayout::GKYXC}},
|
||||
.output = {.config = {.layout = ckb::TensorLayout::NHWGK}}};
|
||||
|
||||
constexpr auto TILE_ALGORITHM = cku::ConvAlgorithm_Tile_GroupedConvolutionKernel{}
|
||||
.with_tile_specializations(ckb::TileConvSpecialization::DEFAULT)
|
||||
.with_tile_thread_block(cku::TileThreadBlock_64x64x64)
|
||||
.with_tile_block_gemm(cku::TileBlockGemmDesc_16x16_v3_intrawave)
|
||||
.with_tile_transfer(cku::TileTransfer_4x4x4)
|
||||
.with_tile_optimizations(ckt::TileOptimizations{
|
||||
.num_groups_to_merge = 1,
|
||||
.split_image = false,
|
||||
.explicit_gemm = false,
|
||||
});
|
||||
|
||||
using Builder = ckb::ConvBuilder<SIGNATURE, TILE_ALGORITHM>;
|
||||
using TileConv = Builder::Instance;
|
||||
using Reference = ckb::ConvBuilder<SIGNATURE, ckt::ConvAlgorithm_Reference{}>::Instance;
|
||||
|
||||
TEST(Fwd2DFp16_TileV3_NHWGC, Create)
|
||||
{
|
||||
constexpr ConvSignature FwdConvSignature{.spatial_dim = 2,
|
||||
.direction = ConvDirection::FORWARD,
|
||||
.data_type = DataType::FP16,
|
||||
.accumulation_data_type = DataType::FP32,
|
||||
.input = {.config = {.layout = TensorLayout::NHWGC}},
|
||||
.weight = {.config = {.layout = TensorLayout::GKYXC}},
|
||||
.output = {.config = {.layout = TensorLayout::NHWGK}}};
|
||||
|
||||
constexpr auto FwdConvAlgorithm =
|
||||
ConvAlgorithm_Tile_GroupedConvolutionKernel{}
|
||||
.with_tile_specializations(TileConvSpecialization::DEFAULT)
|
||||
.with_tile_thread_block(TileThreadBlock_64x64x64)
|
||||
.with_tile_block_gemm(TileBlockGemmDesc_16x16_v3_intrawave)
|
||||
.with_tile_transfer(TileTransfer_4x4x4)
|
||||
.with_tile_optimizations(TileOptimizations{
|
||||
.num_groups_to_merge = 1, .split_image = false, .explicit_gemm = false});
|
||||
|
||||
using Builder = ConvBuilder<FwdConvSignature, FwdConvAlgorithm>;
|
||||
run_ck_tile_test<Builder>({
|
||||
"grouped_convolution_forward",
|
||||
const auto expected_type_string = "grouped_convolution_forward";
|
||||
cku::run_ck_tile_test<Builder>({
|
||||
expected_type_string,
|
||||
"fp16",
|
||||
"NHWGC_GKYXC_NHWGK",
|
||||
"64x64x64",
|
||||
@@ -48,4 +61,51 @@ TEST(FwdConvInstances, Create_ConvAlgorithm_Tile_GroupedConvolutionKernel_2D_FP1
|
||||
});
|
||||
}
|
||||
|
||||
} // namespace
|
||||
TEST(Fwd2DFp16_TileV3_NHWGC, EndToEnd)
|
||||
{
|
||||
if(!ck_tile::get_device_name().starts_with("gfx9"))
|
||||
{
|
||||
GTEST_SKIP() << "unsupported architecture";
|
||||
}
|
||||
|
||||
ckt::Args<SIGNATURE> args = {
|
||||
.lengths =
|
||||
{
|
||||
.batch_size = 16,
|
||||
.groups = 1,
|
||||
.input_channels = 32,
|
||||
.output_channels = 48,
|
||||
.image =
|
||||
{
|
||||
.width = 56,
|
||||
.height = 64,
|
||||
},
|
||||
.filter =
|
||||
{
|
||||
.width = 3,
|
||||
.height = 5,
|
||||
},
|
||||
},
|
||||
.filter_strides = {.width = 1, .height = 1},
|
||||
.filter_dilation = {.width = 1, .height = 1},
|
||||
.input_left_pad = {.width = 0, .height = 0},
|
||||
.input_right_pad = {.width = 0, .height = 0},
|
||||
.a_elementwise_op = {},
|
||||
.b_elementwise_op = {},
|
||||
.cde_elementwise_op = {},
|
||||
};
|
||||
|
||||
auto inputs = ckt::alloc_inputs(args);
|
||||
auto outputs = ckt::alloc_outputs(args);
|
||||
auto reference = ckt::alloc_outputs(args);
|
||||
|
||||
ckt::init_inputs(args, inputs.get());
|
||||
|
||||
auto tile_conv = TileConv{};
|
||||
ckt::run(tile_conv, args, inputs.get(), outputs.get());
|
||||
|
||||
auto ref_conv = Reference{};
|
||||
ckt::run(ref_conv, args, inputs.get(), reference.get());
|
||||
|
||||
EXPECT_THAT(outputs.get(), MatchesReference(args, reference.get()));
|
||||
}
|
||||
|
||||
@@ -142,10 +142,12 @@ TEST(ValidationReportTests, MultipleSomeIncorrect)
|
||||
auto a = ckt::alloc_tensor_buffer(desc);
|
||||
auto b = ckt::alloc_tensor_buffer(desc);
|
||||
|
||||
ckt::fill_tensor_buffer(
|
||||
desc, a.get(), [](size_t i) { return ck::type_convert<ck::bhalf_t>(i % 100); });
|
||||
ckt::fill_tensor_buffer(
|
||||
desc, b.get(), [](size_t i) { return ck::type_convert<ck::bhalf_t>(i % 101); });
|
||||
ckt::fill_tensor_buffer(desc, a.get(), [](size_t i) {
|
||||
return ck::type_convert<ck::bhalf_t>(static_cast<float>(i % 100));
|
||||
});
|
||||
ckt::fill_tensor_buffer(desc, b.get(), [](size_t i) {
|
||||
return ck::type_convert<ck::bhalf_t>(static_cast<float>(i % 101));
|
||||
});
|
||||
|
||||
report.check("incorrect 1", desc, b.get(), a.get());
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user