From ae55803b39d3e4dfda13e0180d8a51cefb68aaa2 Mon Sep 17 00:00:00 2001 From: JH-Leon-KIM-AMD Date: Wed, 14 Jan 2026 12:16:25 +0000 Subject: [PATCH] [CK_BUILDER] ALMIOPEN-522: Testing-specific descriptor initialization Remove old CK host descriptor helper dependency from CK Builder testing framework and implement testing-owned descriptor computation. Core changes (ALMIOPEN-522): - Remove ck/library/utility/convolution_* includes from conv_fwd.hpp - Add ConvFwdProblem struct (testing-owned conv parameter description) - Add Args::make_conv_problem() (computes output spatial, no old CK dependency) - Rewrite make_input/weight/output_descriptor() with testing-specific stride mapping for all supported layouts (GNHWC, NHWGC, etc.) - Add optional explicit tensor-memory stride API (std::optional fields) - Update conv_fwd_ck.hpp and conv_fwd_reference.hpp to use make_conv_problem() - Remove to_ck_conv_param() method Additional improvements: - Add CK Tile forward EndToEnd support (conv_fwd_ck_tile.hpp + test) - Proves shared Args/descriptor design works across backends - Merge Create + EndToEnd tests into single file (matches old CK pattern) - Fix unit_validation.cpp BF16 initialization for reliable testing Results: - Old CK warnings removed (no more RowMajor/ColumnMajor spam) - All smoke tests pass (5/5) - Old CK EndToEnd passes (XDL vs GPU reference) - CK Tile EndToEnd passes (Tile vs GPU reference) Note: This PR conflicts with #3518 (tile-builder-testing). Both touch conv_fwd.hpp but with different approaches. This implementation directly addresses ALMIOPEN-522 requirements by removing old CK dependency. --- .../ck_tile/builder/testing/conv_fwd.hpp | 430 +++++++++++++++--- .../ck_tile/builder/testing/conv_fwd_ck.hpp | 10 +- .../builder/testing/conv_fwd_ck_tile.hpp | 110 +++++ .../builder/testing/conv_fwd_reference.hpp | 38 +- .../ck_tile/test_ckb_conv_fwd_2d_fp16_v3.cpp | 108 ++++- experimental/builder/test/unit_validation.cpp | 10 +- 6 files changed, 602 insertions(+), 104 deletions(-) create mode 100644 experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp index d8910152dd..44596a4d2d 100644 --- a/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp @@ -3,8 +3,11 @@ #pragma once +#include +#include +#include + #include "ck_tile/builder/conv_signature_concepts.hpp" -#include "ck_tile/builder/factory/helpers/ck/conv_tensor_layout.hpp" #include "ck_tile/builder/factory/helpers/ck/conv_elementwise_op.hpp" #include "ck_tile/builder/testing/testing.hpp" #include "ck_tile/builder/testing/testing_reflect.hpp" @@ -13,8 +16,6 @@ #include "ck_tile/builder/testing/tensor_initialization.hpp" #include "ck_tile/builder/testing/tensor_descriptor.hpp" #include "ck_tile/builder/testing/validation.hpp" -#include "ck/library/utility/convolution_parameter.hpp" -#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp" /// This file implements common functionality for invoking/testing grouped /// forward convolutions created through the CK Builder API. The main item @@ -49,6 +50,124 @@ struct ConvTensorLengths FilterExtent filter = {}; // X, Y, Z }; +/// @brief Convolution parameters derived from `Args`. +/// +/// This structure contains all runtime convolution parameters needed by: +/// - descriptor generation (output spatial sizes) +/// - kernel invocation (stride/dilation/pads) +/// +/// It intentionally does NOT depend on old CK utility types (e.g. ConvParam). +template +struct ConvFwdProblem +{ + int G = 1; + int N = 1; + int C = 1; + int K = 1; + + std::array input_spatial = {}; + std::array filter_spatial = {}; + std::array output_spatial = {}; + + std::array conv_strides = {}; + std::array conv_dilations = {}; + std::array left_pads = {}; + std::array right_pads = {}; +}; + +namespace detail { + +/// @brief Calculate memory strides for a tensor with custom dimension ordering. +/// +/// Given tensor dimensions and a memory layout order, compute the stride +/// (memory jump size) needed to move by 1 in each dimension. +/// +/// @param lengths Tensor dimensions (e.g., {3, 4} for 3 rows × 4 columns) +/// @param outer_to_inner Dimension ordering from outermost to innermost in memory +/// @return Strides for each dimension (e.g., {4, 1} for row-major 3×4 tensor) +/// +/// Example: For a 3×4 tensor stored row-major (outer_to_inner = {0, 1}): +/// - Moving 1 row down requires jumping 4 positions → stride[0] = 4 +/// - Moving 1 column right requires jumping 1 position → stride[1] = 1 +template +Extent make_packed_strides_for_order(const Extent& lengths, + const std::array& outer_to_inner) +{ + Extent strides = {}; + + size_t stride = 1; // Innermost dimension always has stride 1 + for(size_t i = RANK; i > 0; --i) + { + const auto dim = outer_to_inner[i - 1]; // Get dimension at this position + strides[dim] = stride; // Assign current stride + stride *= lengths[dim]; // Update stride for next (outer) dimension + } + + return strides; +} + +template +std::array to_spatial_array(const FilterExtent& extent) +{ + if constexpr(SPATIAL_DIM == 1) + { + return {static_cast(extent.width)}; + } + else if constexpr(SPATIAL_DIM == 2) + { + // CK Builder uses spatial ordering {H, W} for 2D. + return {static_cast(extent.height), static_cast(extent.width)}; + } + else + { + // CK Builder uses spatial ordering {D, H, W} for 3D. + return {static_cast(extent.depth), + static_cast(extent.height), + static_cast(extent.width)}; + } +} + +template +std::array +compute_output_spatial(const std::array& input_spatial, + const std::array& filter_spatial, + const std::array& conv_strides, + const std::array& conv_dilations, + const std::array& left_pads, + const std::array& right_pads) +{ + std::array output_spatial = {}; + + for(int i = 0; i < SPATIAL_DIM; ++i) + { + const auto in = input_spatial[i]; + const auto fil = filter_spatial[i]; + const auto s = conv_strides[i]; + const auto d = conv_dilations[i]; + const auto pl = left_pads[i]; + const auto pr = right_pads[i]; + + // effective_filter = dilation*(filter-1) + 1 + const auto effective_filter = d * (fil - 1) + 1; + const auto numerator = in + pl + pr - effective_filter; + + if(s <= 0) + { + throw std::runtime_error("invalid convolution stride (must be > 0)"); + } + if(numerator < 0) + { + throw std::runtime_error("invalid convolution parameters (negative output spatial)"); + } + + output_spatial[i] = numerator / s + 1; + } + + return output_spatial; +} + +} // namespace detail + /// @brief `Args` specialization for forward convolution. /// /// @tparam SIGNATURE Forward convolution signature. @@ -74,15 +193,16 @@ struct Args // TODO: We shouldn't need to call into an internal namespace here. using Ops = factory::internal::ConvElementwiseOps; - // TODO: We shouldn't need to call into an internal namespace here. - using Layouts = factory::internal::ConvTensorLayouts; - ConvTensorLengths lengths; - // TODO: Tensor strides. This needs a new structure as well as some - // reworking of the make_*_descriptor() functions, as the current - // implementation (based on ConvParam in old CK / CK Tile) does not - // support strides at all. + // Optional explicit tensor-memory strides (in elements), for custom/non-packed tensors. + // When not set, packed strides are derived automatically from the selected TensorLayout. + // NOTE: These have explicit default initializers to avoid + // -Wmissing-designated-field-initializers when `Args` is aggregate-initialized + // using designated initializers in tests. + std::optional input_strides = std::nullopt; + std::optional weight_strides = std::nullopt; + std::optional output_strides = std::nullopt; FilterExtent filter_strides; FilterExtent filter_dilation; @@ -93,23 +213,115 @@ struct Args Ops::WeiElementwiseOp b_elementwise_op; Ops::OutElementwiseOp cde_elementwise_op; + /// @brief Convert `Args` into a testing-owned forward convolution problem description. + ConvFwdProblem make_conv_problem() const + { + ConvFwdProblem problem; + + problem.G = static_cast(this->lengths.groups); + problem.N = static_cast(this->lengths.batch_size); + problem.C = static_cast(this->lengths.input_channels); + problem.K = static_cast(this->lengths.output_channels); + + problem.input_spatial = detail::to_spatial_array(this->lengths.image); + problem.filter_spatial = detail::to_spatial_array(this->lengths.filter); + + problem.conv_strides = detail::to_spatial_array(this->filter_strides); + problem.conv_dilations = detail::to_spatial_array(this->filter_dilation); + problem.left_pads = detail::to_spatial_array(this->input_left_pad); + problem.right_pads = detail::to_spatial_array(this->input_right_pad); + + problem.output_spatial = detail::compute_output_spatial(problem.input_spatial, + problem.filter_spatial, + problem.conv_strides, + problem.conv_dilations, + problem.left_pads, + problem.right_pads); + + return problem; + } + /// This function returns the `TensorDescriptor` corresponding to /// the input-tensor of the convolution problem. This can then /// be used to, for example, allocate memory. InputDescriptor make_input_descriptor() const { - // TODO: We're using old CK functionality to compute the right - // values here, mainly because CK tile does not support the - // right tensor layouts here. We should probably change that - // because CK currently prints an annoying message about it, - // plus that would let us get rid of the `to_ck_conv_param()` - // function. - const auto param = to_ck_conv_param(); - const auto desc = ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed< - typename Layouts::InLayout>(param); using Extent = typename InputDescriptor::Extent; - return InputDescriptor(Extent::from_vector(desc.GetLengths()), - Extent::from_vector(desc.GetStrides())); + Extent lens = {}; + + lens[0] = this->lengths.groups; + lens[1] = this->lengths.batch_size; + lens[2] = this->lengths.input_channels; + if constexpr(SPATIAL_DIM == 1) + { + lens[3] = this->lengths.image.width; + } + else if constexpr(SPATIAL_DIM == 2) + { + lens[3] = this->lengths.image.height; + lens[4] = this->lengths.image.width; + } + else + { + lens[3] = this->lengths.image.depth; + lens[4] = this->lengths.image.height; + lens[5] = this->lengths.image.width; + } + + const auto make_default_strides = [&] { + constexpr auto layout = SIGNATURE.input.config.layout; + + if constexpr(SPATIAL_DIM == 1) + { + if constexpr(layout == TensorLayout::GNCW) + return detail::make_packed_strides_for_order<4>(lens, {0, 1, 2, 3}); + else if constexpr(layout == TensorLayout::GNWC || + layout == TensorLayout::G_NW_C_strided) + return detail::make_packed_strides_for_order<4>(lens, {0, 1, 3, 2}); + else if constexpr(layout == TensorLayout::NWGC) + return detail::make_packed_strides_for_order<4>(lens, {1, 3, 0, 2}); + else if constexpr(layout == TensorLayout::NGCW) + return detail::make_packed_strides_for_order<4>(lens, {1, 0, 2, 3}); + else + static_assert(sizeof(UnsupportedEnumValue) == 0, + "Unsupported 1D input layout for descriptor initialization."); + } + else if constexpr(SPATIAL_DIM == 2) + { + if constexpr(layout == TensorLayout::GNCHW) + return detail::make_packed_strides_for_order<5>(lens, {0, 1, 2, 3, 4}); + else if constexpr(layout == TensorLayout::GNHWC || + layout == TensorLayout::G_NHW_C_strided) + return detail::make_packed_strides_for_order<5>(lens, {0, 1, 3, 4, 2}); + else if constexpr(layout == TensorLayout::NHWGC) + return detail::make_packed_strides_for_order<5>(lens, {1, 3, 4, 0, 2}); + else if constexpr(layout == TensorLayout::NGCHW) + return detail::make_packed_strides_for_order<5>(lens, {1, 0, 2, 3, 4}); + else + static_assert(sizeof(UnsupportedEnumValue) == 0, + "Unsupported 2D input layout for descriptor initialization."); + } + else + { + if constexpr(layout == TensorLayout::GNCDHW) + return detail::make_packed_strides_for_order<6>(lens, {0, 1, 2, 3, 4, 5}); + else if constexpr(layout == TensorLayout::GNDHWC || + layout == TensorLayout::G_NDHW_C_strided) + return detail::make_packed_strides_for_order<6>(lens, {0, 1, 3, 4, 5, 2}); + else if constexpr(layout == TensorLayout::NDHWGC) + return detail::make_packed_strides_for_order<6>(lens, {1, 3, 4, 5, 0, 2}); + else if constexpr(layout == TensorLayout::NGCDHW) + return detail::make_packed_strides_for_order<6>(lens, {1, 0, 2, 3, 4, 5}); + else + static_assert(sizeof(UnsupportedEnumValue) == 0, + "Unsupported 3D input layout for descriptor initialization."); + } + }; + + const Extent strides = + this->input_strides.has_value() ? *this->input_strides : make_default_strides(); + + return InputDescriptor(lens, strides); } /// This function returns the `TensorDescriptor` corresponding to @@ -117,13 +329,76 @@ struct Args /// be used to, for example, allocate memory. WeightDescriptor make_weight_descriptor() const { - // See note in implementation of `make_input_descriptor`. - const auto param = to_ck_conv_param(); - const auto desc = ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed< - typename Layouts::WeiLayout>(param); using Extent = typename WeightDescriptor::Extent; - return WeightDescriptor(Extent::from_vector(desc.GetLengths()), - Extent::from_vector(desc.GetStrides())); + Extent lens = {}; + + lens[0] = this->lengths.groups; + lens[1] = this->lengths.output_channels; + lens[2] = this->lengths.input_channels; + if constexpr(SPATIAL_DIM == 1) + { + lens[3] = this->lengths.filter.width; + } + else if constexpr(SPATIAL_DIM == 2) + { + lens[3] = this->lengths.filter.height; + lens[4] = this->lengths.filter.width; + } + else + { + lens[3] = this->lengths.filter.depth; + lens[4] = this->lengths.filter.height; + lens[5] = this->lengths.filter.width; + } + + const auto make_default_strides = [&] { + constexpr auto layout = SIGNATURE.weight.config.layout; + + if constexpr(SPATIAL_DIM == 1) + { + if constexpr(layout == TensorLayout::GKCX) + return detail::make_packed_strides_for_order<4>(lens, {0, 1, 2, 3}); + else if constexpr(layout == TensorLayout::GKXC || + layout == TensorLayout::G_K_X_C_strided) + return detail::make_packed_strides_for_order<4>(lens, {0, 1, 3, 2}); + else if constexpr(layout == TensorLayout::KXGC) + return detail::make_packed_strides_for_order<4>(lens, {1, 3, 0, 2}); + else + static_assert(sizeof(UnsupportedEnumValue) == 0, + "Unsupported 1D weight layout for descriptor initialization."); + } + else if constexpr(SPATIAL_DIM == 2) + { + if constexpr(layout == TensorLayout::GKCYX) + return detail::make_packed_strides_for_order<5>(lens, {0, 1, 2, 3, 4}); + else if constexpr(layout == TensorLayout::GKYXC || + layout == TensorLayout::G_K_YX_C_strided) + return detail::make_packed_strides_for_order<5>(lens, {0, 1, 3, 4, 2}); + else if constexpr(layout == TensorLayout::KYXGC) + return detail::make_packed_strides_for_order<5>(lens, {1, 3, 4, 0, 2}); + else + static_assert(sizeof(UnsupportedEnumValue) == 0, + "Unsupported 2D weight layout for descriptor initialization."); + } + else + { + if constexpr(layout == TensorLayout::GKCZYX) + return detail::make_packed_strides_for_order<6>(lens, {0, 1, 2, 3, 4, 5}); + else if constexpr(layout == TensorLayout::GKZYXC || + layout == TensorLayout::G_K_ZYX_C_strided) + return detail::make_packed_strides_for_order<6>(lens, {0, 1, 3, 4, 5, 2}); + else if constexpr(layout == TensorLayout::KZYXGC) + return detail::make_packed_strides_for_order<6>(lens, {1, 3, 4, 5, 0, 2}); + else + static_assert(sizeof(UnsupportedEnumValue) == 0, + "Unsupported 3D weight layout for descriptor initialization."); + } + }; + + const Extent strides = + this->weight_strides.has_value() ? *this->weight_strides : make_default_strides(); + + return WeightDescriptor(lens, strides); } /// This function returns the `TensorDescriptor` corresponding to @@ -131,43 +406,84 @@ struct Args /// be used to, for example, allocate memory. OutputDescriptor make_output_descriptor() const { - // See note in implementation of `make_input_descriptor`. - const auto param = to_ck_conv_param(); - const auto desc = ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed< - typename Layouts::OutLayout>(param); using Extent = typename OutputDescriptor::Extent; - return OutputDescriptor(Extent::from_vector(desc.GetLengths()), - Extent::from_vector(desc.GetStrides())); - } + Extent lens = {}; + + const auto problem = make_conv_problem(); + + lens[0] = this->lengths.groups; + lens[1] = this->lengths.batch_size; + lens[2] = this->lengths.output_channels; + if constexpr(SPATIAL_DIM == 1) + { + lens[3] = static_cast(problem.output_spatial[0]); + } + else if constexpr(SPATIAL_DIM == 2) + { + lens[3] = static_cast(problem.output_spatial[0]); + lens[4] = static_cast(problem.output_spatial[1]); + } + else + { + lens[3] = static_cast(problem.output_spatial[0]); + lens[4] = static_cast(problem.output_spatial[1]); + lens[5] = static_cast(problem.output_spatial[2]); + } + + const auto make_default_strides = [&] { + constexpr auto layout = SIGNATURE.output.config.layout; - /// Convert the Args structure into a CK conv_param structure. This - /// function is mainly used to be able to use the existing - /// CK-functionality to obtain tensor descriptors. - ck::utils::conv::ConvParam to_ck_conv_param() const - { - const auto to_vector = [](const auto& extent) { if constexpr(SPATIAL_DIM == 1) - return std::vector{ck::index_t(extent.width)}; + { + if constexpr(layout == TensorLayout::GNKW) + return detail::make_packed_strides_for_order<4>(lens, {0, 1, 2, 3}); + else if constexpr(layout == TensorLayout::GNWK || + layout == TensorLayout::G_NW_K_strided) + return detail::make_packed_strides_for_order<4>(lens, {0, 1, 3, 2}); + else if constexpr(layout == TensorLayout::NWGK) + return detail::make_packed_strides_for_order<4>(lens, {1, 3, 0, 2}); + else if constexpr(layout == TensorLayout::NGKW) + return detail::make_packed_strides_for_order<4>(lens, {1, 0, 2, 3}); + else + static_assert(sizeof(UnsupportedEnumValue) == 0, + "Unsupported 1D output layout for descriptor initialization."); + } else if constexpr(SPATIAL_DIM == 2) - return std::vector{ck::index_t(extent.height), - ck::index_t(extent.width)}; + { + if constexpr(layout == TensorLayout::GNKHW) + return detail::make_packed_strides_for_order<5>(lens, {0, 1, 2, 3, 4}); + else if constexpr(layout == TensorLayout::GNHWK || + layout == TensorLayout::G_NHW_K_strided) + return detail::make_packed_strides_for_order<5>(lens, {0, 1, 3, 4, 2}); + else if constexpr(layout == TensorLayout::NHWGK) + return detail::make_packed_strides_for_order<5>(lens, {1, 3, 4, 0, 2}); + else if constexpr(layout == TensorLayout::NGKHW) + return detail::make_packed_strides_for_order<5>(lens, {1, 0, 2, 3, 4}); + else + static_assert(sizeof(UnsupportedEnumValue) == 0, + "Unsupported 2D output layout for descriptor initialization."); + } else - return std::vector{ck::index_t(extent.depth), - ck::index_t(extent.height), - ck::index_t(extent.width)}; + { + if constexpr(layout == TensorLayout::GNKDHW) + return detail::make_packed_strides_for_order<6>(lens, {0, 1, 2, 3, 4, 5}); + else if constexpr(layout == TensorLayout::GNDHWK || + layout == TensorLayout::G_NDHW_K_strided) + return detail::make_packed_strides_for_order<6>(lens, {0, 1, 3, 4, 5, 2}); + else if constexpr(layout == TensorLayout::NDHWGK) + return detail::make_packed_strides_for_order<6>(lens, {1, 3, 4, 5, 0, 2}); + else if constexpr(layout == TensorLayout::NGKDHW) + return detail::make_packed_strides_for_order<6>(lens, {1, 0, 2, 3, 4, 5}); + else + static_assert(sizeof(UnsupportedEnumValue) == 0, + "Unsupported 3D output layout for descriptor initialization."); + } }; - return ck::utils::conv::ConvParam(SPATIAL_DIM, - this->lengths.groups, - this->lengths.batch_size, - this->lengths.output_channels, - this->lengths.input_channels, - to_vector(this->lengths.filter), - to_vector(this->lengths.image), - to_vector(this->filter_strides), - to_vector(this->filter_dilation), - to_vector(this->input_left_pad), - to_vector(this->input_right_pad)); + const Extent strides = + this->output_strides.has_value() ? *this->output_strides : make_default_strides(); + + return OutputDescriptor(lens, strides); } }; diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp index a90f53ba7d..c0b9b9c82f 100644 --- a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp @@ -116,7 +116,7 @@ void run(CkConvInstance auto& conv, return result; }; - const auto param = args.to_ck_conv_param(); + const auto problem = args.make_conv_problem(); const auto input_desc = args.make_input_descriptor(); const auto weight_desc = args.make_weight_descriptor(); @@ -134,10 +134,10 @@ void run(CkConvInstance auto& conv, {}, to_ck_lengths(output_desc.get_lengths()), to_ck_lengths(output_desc.get_strides()), - to_ck_extent(param.conv_filter_strides_), - to_ck_extent(param.conv_filter_dilations_), - to_ck_extent(param.input_left_pads_), - to_ck_extent(param.input_right_pads_), + to_ck_extent(problem.conv_strides), + to_ck_extent(problem.conv_dilations), + to_ck_extent(problem.left_pads), + to_ck_extent(problem.right_pads), args.a_elementwise_op, args.b_elementwise_op, args.cde_elementwise_op); diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp new file mode 100644 index 0000000000..79f46687c6 --- /dev/null +++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp @@ -0,0 +1,110 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include "ck_tile/builder/testing/conv_fwd.hpp" +#include "ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp" +#include "ck_tile/host/kernel_launch.hpp" +#include +#include +#include +#include + +/// This file contains the implementation details for invoking/testing +/// grouped forward convolution operations using CK Tile kernels. +/// The main item is the `run()` function, which is used to invoke the +/// CK Tile grouped convolution forward kernel instances. + +namespace ck_tile::builder::test { + +template +concept TileConvInstance = requires { + typename Conv::CDElementwise; + typename Conv::GroupedConvFwdKernelArgsSpecialized; + { + Conv::MakeKernelArgs( + std::declval>()) + } -> std::convertible_to; +}; + +/// @brief `run()` specialization for forward convolution and CK Tile. +/// +/// @tparam SIGNATURE Forward convolution signature. +/// @throws std::runtime_error if the arguments weren't actually valid for the +/// operation. This should be caught and reported by the testing framework. +/// +/// @see run() +template + requires ValidConvSignature && ConvDirectionIsForward +void run(TileConvInstance auto& conv, + const Args& args, + const Inputs& inputs, + const Outputs& outputs) +{ + constexpr auto spatial_dim = SIGNATURE.spatial_dim; + + // For now, CK Tile EndToEnd only supports packed tensors. + // (Explicit custom strides are added as API in Args, but tile kernels + // are not yet wired for that in the testing framework.) + if(!args.make_input_descriptor().is_packed()) + throw std::runtime_error("TODO: Support non-packed input tensor in CK Tile runner"); + if(!args.make_weight_descriptor().is_packed()) + throw std::runtime_error("TODO: Support non-packed weight tensor in CK Tile runner"); + if(!args.make_output_descriptor().is_packed()) + throw std::runtime_error("TODO: Support non-packed output tensor in CK Tile runner"); + + const auto problem = args.make_conv_problem(); + + const std::vector input_spatial(problem.input_spatial.begin(), + problem.input_spatial.end()); + const std::vector filter_spatial(problem.filter_spatial.begin(), + problem.filter_spatial.end()); + const std::vector conv_strides(problem.conv_strides.begin(), + problem.conv_strides.end()); + const std::vector conv_dilations(problem.conv_dilations.begin(), + problem.conv_dilations.end()); + const std::vector left_pads(problem.left_pads.begin(), + problem.left_pads.end()); + const std::vector right_pads(problem.right_pads.begin(), + problem.right_pads.end()); + + // CK Tile host args are built around ck_tile::conv::ConvParam. + ck_tile::conv::ConvParam conv_param(static_cast(spatial_dim), + problem.G, + problem.N, + problem.K, + problem.C, + filter_spatial, + input_spatial, + conv_strides, + conv_dilations, + left_pads, + right_pads); + + using Kernel = std::remove_cvref_t; + using CDElementwise = typename Kernel::CDElementwise; + + ck_tile::GroupedConvFwdHostArgs host_args(conv_param, + inputs.input, + inputs.weight, + {}, + outputs.output, + /*k_batch=*/1, + CDElementwise{}); + + auto kargs = Kernel::MakeKernelArgs(host_args); + + if(!Kernel::IsSupportedArgument(kargs)) + { + throw std::runtime_error("invalid argument"); + } + + const dim3 grids = Kernel::GridSize(kargs); + const dim3 blocks = Kernel::BlockSize(); + + (void)ck_tile::launch_kernel(ck_tile::stream_config{}, + ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs)); +} + +} // namespace ck_tile::builder::test diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_reference.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_reference.hpp index 85493e32eb..c7a2518eab 100644 --- a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_reference.hpp +++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_reference.hpp @@ -74,16 +74,14 @@ void run(RefConvInstance auto& conv, const Inputs& inputs, const Outputs& outputs) { - // We don't want to compute the output dims manually, just get - // them via the existing infrastructure - const auto param = args.to_ck_conv_param(); + const auto problem = args.make_conv_problem(); // TODO: The reference convolution is currently missing a few features. // Just throw for now, but regard these as TODO items that should be resolved // eventually. // Right pads are not supported right now for some reason. - for(auto right_pad : param.input_right_pads_) + for(auto right_pad : problem.right_pads) { if(right_pad != 0) throw std::runtime_error("TODO: Support right pad in reference conv"); @@ -96,19 +94,31 @@ void run(RefConvInstance auto& conv, if(!args.make_output_descriptor().is_packed()) throw std::runtime_error("TODO: Support non-packed output tensor in reference conv"); + const std::vector input_spatial(problem.input_spatial.begin(), + problem.input_spatial.end()); + const std::vector filter_spatial(problem.filter_spatial.begin(), + problem.filter_spatial.end()); + const std::vector output_spatial(problem.output_spatial.begin(), + problem.output_spatial.end()); + const std::vector conv_strides(problem.conv_strides.begin(), + problem.conv_strides.end()); + const std::vector conv_dilations(problem.conv_dilations.begin(), + problem.conv_dilations.end()); + const std::vector left_pads(problem.left_pads.begin(), problem.left_pads.end()); + conv.Run(inputs.input, inputs.weight, outputs.output, - param.G_, - param.N_, - param.K_, - param.C_, - param.input_spatial_lengths_, - param.filter_spatial_lengths_, - param.output_spatial_lengths_, - param.conv_filter_strides_, - param.conv_filter_dilations_, - param.input_left_pads_); + problem.G, + problem.N, + problem.K, + problem.C, + input_spatial, + filter_spatial, + output_spatial, + conv_strides, + conv_dilations, + left_pads); } } // namespace ck_tile::builder::test diff --git a/experimental/builder/test/conv/ck_tile/test_ckb_conv_fwd_2d_fp16_v3.cpp b/experimental/builder/test/conv/ck_tile/test_ckb_conv_fwd_2d_fp16_v3.cpp index 2c35fb5076..2d4c237790 100644 --- a/experimental/builder/test/conv/ck_tile/test_ckb_conv_fwd_2d_fp16_v3.cpp +++ b/experimental/builder/test/conv/ck_tile/test_ckb_conv_fwd_2d_fp16_v3.cpp @@ -3,33 +3,46 @@ #include "utils/ckb_conv_tile_test_configs.hpp" #include "utils/ckb_conv_test_utils.hpp" +#include "ck_tile/builder/testing/conv_fwd_ck_tile.hpp" +#include "ck_tile/builder/testing/conv_fwd_reference.hpp" +#include "ck_tile/host/device_prop.hpp" +#include "testing_utils.hpp" -namespace { +namespace ckb = ck_tile::builder; +namespace ckt = ck_tile::builder::test; +namespace cku = ck_tile::builder::test_utils; -using namespace ck_tile::builder::test_utils; +using ck_tile::test::MatchesReference; -TEST(FwdConvInstances, Create_ConvAlgorithm_Tile_GroupedConvolutionKernel_2D_FP16_NHWGC) +constexpr auto SIGNATURE = + ckt::ConvSignature{.spatial_dim = 2, + .direction = ckb::ConvDirection::FORWARD, + .data_type = ckb::DataType::FP16, + .accumulation_data_type = ckb::DataType::FP32, + .input = {.config = {.layout = ckb::TensorLayout::NHWGC}}, + .weight = {.config = {.layout = ckb::TensorLayout::GKYXC}}, + .output = {.config = {.layout = ckb::TensorLayout::NHWGK}}}; + +constexpr auto TILE_ALGORITHM = cku::ConvAlgorithm_Tile_GroupedConvolutionKernel{} + .with_tile_specializations(ckb::TileConvSpecialization::DEFAULT) + .with_tile_thread_block(cku::TileThreadBlock_64x64x64) + .with_tile_block_gemm(cku::TileBlockGemmDesc_16x16_v3_intrawave) + .with_tile_transfer(cku::TileTransfer_4x4x4) + .with_tile_optimizations(ckt::TileOptimizations{ + .num_groups_to_merge = 1, + .split_image = false, + .explicit_gemm = false, + }); + +using Builder = ckb::ConvBuilder; +using TileConv = Builder::Instance; +using Reference = ckb::ConvBuilder::Instance; + +TEST(Fwd2DFp16_TileV3_NHWGC, Create) { - constexpr ConvSignature FwdConvSignature{.spatial_dim = 2, - .direction = ConvDirection::FORWARD, - .data_type = DataType::FP16, - .accumulation_data_type = DataType::FP32, - .input = {.config = {.layout = TensorLayout::NHWGC}}, - .weight = {.config = {.layout = TensorLayout::GKYXC}}, - .output = {.config = {.layout = TensorLayout::NHWGK}}}; - - constexpr auto FwdConvAlgorithm = - ConvAlgorithm_Tile_GroupedConvolutionKernel{} - .with_tile_specializations(TileConvSpecialization::DEFAULT) - .with_tile_thread_block(TileThreadBlock_64x64x64) - .with_tile_block_gemm(TileBlockGemmDesc_16x16_v3_intrawave) - .with_tile_transfer(TileTransfer_4x4x4) - .with_tile_optimizations(TileOptimizations{ - .num_groups_to_merge = 1, .split_image = false, .explicit_gemm = false}); - - using Builder = ConvBuilder; - run_ck_tile_test({ - "grouped_convolution_forward", + const auto expected_type_string = "grouped_convolution_forward"; + cku::run_ck_tile_test({ + expected_type_string, "fp16", "NHWGC_GKYXC_NHWGK", "64x64x64", @@ -48,4 +61,51 @@ TEST(FwdConvInstances, Create_ConvAlgorithm_Tile_GroupedConvolutionKernel_2D_FP1 }); } -} // namespace +TEST(Fwd2DFp16_TileV3_NHWGC, EndToEnd) +{ + if(!ck_tile::get_device_name().starts_with("gfx9")) + { + GTEST_SKIP() << "unsupported architecture"; + } + + ckt::Args args = { + .lengths = + { + .batch_size = 16, + .groups = 1, + .input_channels = 32, + .output_channels = 48, + .image = + { + .width = 56, + .height = 64, + }, + .filter = + { + .width = 3, + .height = 5, + }, + }, + .filter_strides = {.width = 1, .height = 1}, + .filter_dilation = {.width = 1, .height = 1}, + .input_left_pad = {.width = 0, .height = 0}, + .input_right_pad = {.width = 0, .height = 0}, + .a_elementwise_op = {}, + .b_elementwise_op = {}, + .cde_elementwise_op = {}, + }; + + auto inputs = ckt::alloc_inputs(args); + auto outputs = ckt::alloc_outputs(args); + auto reference = ckt::alloc_outputs(args); + + ckt::init_inputs(args, inputs.get()); + + auto tile_conv = TileConv{}; + ckt::run(tile_conv, args, inputs.get(), outputs.get()); + + auto ref_conv = Reference{}; + ckt::run(ref_conv, args, inputs.get(), reference.get()); + + EXPECT_THAT(outputs.get(), MatchesReference(args, reference.get())); +} diff --git a/experimental/builder/test/unit_validation.cpp b/experimental/builder/test/unit_validation.cpp index a83d034ac2..7cc9968f21 100644 --- a/experimental/builder/test/unit_validation.cpp +++ b/experimental/builder/test/unit_validation.cpp @@ -142,10 +142,12 @@ TEST(ValidationReportTests, MultipleSomeIncorrect) auto a = ckt::alloc_tensor_buffer(desc); auto b = ckt::alloc_tensor_buffer(desc); - ckt::fill_tensor_buffer( - desc, a.get(), [](size_t i) { return ck::type_convert(i % 100); }); - ckt::fill_tensor_buffer( - desc, b.get(), [](size_t i) { return ck::type_convert(i % 101); }); + ckt::fill_tensor_buffer(desc, a.get(), [](size_t i) { + return ck::type_convert(static_cast(i % 100)); + }); + ckt::fill_tensor_buffer(desc, b.get(), [](size_t i) { + return ck::type_convert(static_cast(i % 101)); + }); report.check("incorrect 1", desc, b.get(), a.get()); }