From ae55803b39d3e4dfda13e0180d8a51cefb68aaa2 Mon Sep 17 00:00:00 2001
From: JH-Leon-KIM-AMD <jeonghyun.kim@amd.com>
Date: Wed, 14 Jan 2026 12:16:25 +0000
Subject: [PATCH] [CK_BUILDER] ALMIOPEN-522: Testing-specific descriptor
 initialization

Remove old CK host descriptor helper dependency from CK Builder testing
framework and implement testing-owned descriptor computation.

Core changes (ALMIOPEN-522):
- Remove ck/library/utility/convolution_* includes from conv_fwd.hpp
- Add ConvFwdProblem struct (testing-owned conv parameter description)
- Add Args::make_conv_problem() (computes output spatial, no old CK dependency)
- Rewrite make_input/weight/output_descriptor() with testing-specific
  stride mapping for all supported layouts (GNHWC, NHWGC, etc.)
- Add optional explicit tensor-memory stride API (std::optional fields)
- Update conv_fwd_ck.hpp and conv_fwd_reference.hpp to use make_conv_problem()
- Remove to_ck_conv_param() method

Additional improvements:
- Add CK Tile forward EndToEnd support (conv_fwd_ck_tile.hpp + test)
- Proves shared Args/descriptor design works across backends
- Merge Create + EndToEnd tests into single file (matches old CK pattern)
- Fix unit_validation.cpp BF16 initialization for reliable testing

Results:
- Old CK warnings removed (no more RowMajor/ColumnMajor spam)
- All smoke tests pass (5/5)
- Old CK EndToEnd passes (XDL vs GPU reference)
- CK Tile EndToEnd passes (Tile vs GPU reference)

Note: This PR conflicts with #3518 (tile-builder-testing). Both touch
conv_fwd.hpp but with different approaches. This implementation directly
addresses ALMIOPEN-522 requirements by removing old CK dependency.
---
 .../ck_tile/builder/testing/conv_fwd.hpp      | 430 +++++++++++++++---
 .../ck_tile/builder/testing/conv_fwd_ck.hpp   |  10 +-
 .../builder/testing/conv_fwd_ck_tile.hpp      | 110 +++++
 .../builder/testing/conv_fwd_reference.hpp    |  38 +-
 .../ck_tile/test_ckb_conv_fwd_2d_fp16_v3.cpp  | 108 ++++-
 experimental/builder/test/unit_validation.cpp |  10 +-
 6 files changed, 602 insertions(+), 104 deletions(-)
 create mode 100644 experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp
diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp
index d8910152dd..44596a4d2d 100644
--- a/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp
+++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd.hpp
@@ -3,8 +3,11 @@
 
 #pragma once
 
+#include <array>
+#include <optional>
+#include <stdexcept>
+
 #include "ck_tile/builder/conv_signature_concepts.hpp"
-#include "ck_tile/builder/factory/helpers/ck/conv_tensor_layout.hpp"
 #include "ck_tile/builder/factory/helpers/ck/conv_elementwise_op.hpp"
 #include "ck_tile/builder/testing/testing.hpp"
 #include "ck_tile/builder/testing/testing_reflect.hpp"
@@ -13,8 +16,6 @@
 #include "ck_tile/builder/testing/tensor_initialization.hpp"
 #include "ck_tile/builder/testing/tensor_descriptor.hpp"
 #include "ck_tile/builder/testing/validation.hpp"
-#include "ck/library/utility/convolution_parameter.hpp"
-#include "ck/library/utility/convolution_host_tensor_descriptor_helper.hpp"
 
 /// This file implements common functionality for invoking/testing grouped
 /// forward convolutions created through the CK Builder API. The main item
@@ -49,6 +50,124 @@ struct ConvTensorLengths
     FilterExtent<SPATIAL_DIM> filter = {}; // X, Y, Z
 };
 
+/// @brief Convolution parameters derived from `Args`.
+///
+/// This structure contains all runtime convolution parameters needed by:
+/// - descriptor generation (output spatial sizes)
+/// - kernel invocation (stride/dilation/pads)
+///
+/// It intentionally does NOT depend on old CK utility types (e.g. ConvParam).
+template <int SPATIAL_DIM>
+struct ConvFwdProblem
+{
+    int G = 1;
+    int N = 1;
+    int C = 1;
+    int K = 1;
+
+    std::array<long_index_t, SPATIAL_DIM> input_spatial  = {};
+    std::array<long_index_t, SPATIAL_DIM> filter_spatial = {};
+    std::array<long_index_t, SPATIAL_DIM> output_spatial = {};
+
+    std::array<long_index_t, SPATIAL_DIM> conv_strides   = {};
+    std::array<long_index_t, SPATIAL_DIM> conv_dilations = {};
+    std::array<long_index_t, SPATIAL_DIM> left_pads      = {};
+    std::array<long_index_t, SPATIAL_DIM> right_pads     = {};
+};
+
+namespace detail {
+
+/// @brief Calculate memory strides for a tensor with custom dimension ordering.
+///
+/// Given tensor dimensions and a memory layout order, compute the stride
+/// (memory jump size) needed to move by 1 in each dimension.
+///
+/// @param lengths Tensor dimensions (e.g., {3, 4} for 3 rows × 4 columns)
+/// @param outer_to_inner Dimension ordering from outermost to innermost in memory
+/// @return Strides for each dimension (e.g., {4, 1} for row-major 3×4 tensor)
+///
+/// Example: For a 3×4 tensor stored row-major (outer_to_inner = {0, 1}):
+///   - Moving 1 row down requires jumping 4 positions → stride[0] = 4
+///   - Moving 1 column right requires jumping 1 position → stride[1] = 1
+template <size_t RANK>
+Extent<RANK> make_packed_strides_for_order(const Extent<RANK>& lengths,
+                                           const std::array<size_t, RANK>& outer_to_inner)
+{
+    Extent<RANK> strides = {};
+
+    size_t stride = 1; // Innermost dimension always has stride 1
+    for(size_t i = RANK; i > 0; --i)
+    {
+        const auto dim = outer_to_inner[i - 1]; // Get dimension at this position
+        strides[dim]   = stride;                // Assign current stride
+        stride *= lengths[dim];                 // Update stride for next (outer) dimension
+    }
+
+    return strides;
+}
+
+template <int SPATIAL_DIM>
+std::array<long_index_t, SPATIAL_DIM> to_spatial_array(const FilterExtent<SPATIAL_DIM>& extent)
+{
+    if constexpr(SPATIAL_DIM == 1)
+    {
+        return {static_cast<long_index_t>(extent.width)};
+    }
+    else if constexpr(SPATIAL_DIM == 2)
+    {
+        // CK Builder uses spatial ordering {H, W} for 2D.
+        return {static_cast<long_index_t>(extent.height), static_cast<long_index_t>(extent.width)};
+    }
+    else
+    {
+        // CK Builder uses spatial ordering {D, H, W} for 3D.
+        return {static_cast<long_index_t>(extent.depth),
+                static_cast<long_index_t>(extent.height),
+                static_cast<long_index_t>(extent.width)};
+    }
+}
+
+template <int SPATIAL_DIM>
+std::array<long_index_t, SPATIAL_DIM>
+compute_output_spatial(const std::array<long_index_t, SPATIAL_DIM>& input_spatial,
+                       const std::array<long_index_t, SPATIAL_DIM>& filter_spatial,
+                       const std::array<long_index_t, SPATIAL_DIM>& conv_strides,
+                       const std::array<long_index_t, SPATIAL_DIM>& conv_dilations,
+                       const std::array<long_index_t, SPATIAL_DIM>& left_pads,
+                       const std::array<long_index_t, SPATIAL_DIM>& right_pads)
+{
+    std::array<long_index_t, SPATIAL_DIM> output_spatial = {};
+
+    for(int i = 0; i < SPATIAL_DIM; ++i)
+    {
+        const auto in  = input_spatial[i];
+        const auto fil = filter_spatial[i];
+        const auto s   = conv_strides[i];
+        const auto d   = conv_dilations[i];
+        const auto pl  = left_pads[i];
+        const auto pr  = right_pads[i];
+
+        // effective_filter = dilation*(filter-1) + 1
+        const auto effective_filter = d * (fil - 1) + 1;
+        const auto numerator        = in + pl + pr - effective_filter;
+
+        if(s <= 0)
+        {
+            throw std::runtime_error("invalid convolution stride (must be > 0)");
+        }
+        if(numerator < 0)
+        {
+            throw std::runtime_error("invalid convolution parameters (negative output spatial)");
+        }
+
+        output_spatial[i] = numerator / s + 1;
+    }
+
+    return output_spatial;
+}
+
+} // namespace detail
+
 /// @brief `Args` specialization for forward convolution.
 ///
 /// @tparam SIGNATURE Forward convolution signature.
@@ -74,15 +193,16 @@ struct Args<SIGNATURE>
     // TODO: We shouldn't need to call into an internal namespace here.
     using Ops = factory::internal::ConvElementwiseOps<SIGNATURE>;
 
-    // TODO: We shouldn't need to call into an internal namespace here.
-    using Layouts = factory::internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
-
     ConvTensorLengths<SPATIAL_DIM> lengths;
 
-    // TODO: Tensor strides. This needs a new structure as well as some
-    // reworking of the make_*_descriptor() functions, as the current
-    // implementation (based on ConvParam in old CK / CK Tile) does not
-    // support strides at all.
+    // Optional explicit tensor-memory strides (in elements), for custom/non-packed tensors.
+    // When not set, packed strides are derived automatically from the selected TensorLayout.
+    // NOTE: These have explicit default initializers to avoid
+    // -Wmissing-designated-field-initializers when `Args` is aggregate-initialized
+    // using designated initializers in tests.
+    std::optional<typename InputDescriptor::Extent> input_strides   = std::nullopt;
+    std::optional<typename WeightDescriptor::Extent> weight_strides = std::nullopt;
+    std::optional<typename OutputDescriptor::Extent> output_strides = std::nullopt;
 
     FilterExtent<SPATIAL_DIM> filter_strides;
     FilterExtent<SPATIAL_DIM> filter_dilation;
@@ -93,23 +213,115 @@ struct Args<SIGNATURE>
     Ops::WeiElementwiseOp b_elementwise_op;
     Ops::OutElementwiseOp cde_elementwise_op;
 
+    /// @brief Convert `Args` into a testing-owned forward convolution problem description.
+    ConvFwdProblem<SPATIAL_DIM> make_conv_problem() const
+    {
+        ConvFwdProblem<SPATIAL_DIM> problem;
+
+        problem.G = static_cast<int>(this->lengths.groups);
+        problem.N = static_cast<int>(this->lengths.batch_size);
+        problem.C = static_cast<int>(this->lengths.input_channels);
+        problem.K = static_cast<int>(this->lengths.output_channels);
+
+        problem.input_spatial  = detail::to_spatial_array<SPATIAL_DIM>(this->lengths.image);
+        problem.filter_spatial = detail::to_spatial_array<SPATIAL_DIM>(this->lengths.filter);
+
+        problem.conv_strides   = detail::to_spatial_array<SPATIAL_DIM>(this->filter_strides);
+        problem.conv_dilations = detail::to_spatial_array<SPATIAL_DIM>(this->filter_dilation);
+        problem.left_pads      = detail::to_spatial_array<SPATIAL_DIM>(this->input_left_pad);
+        problem.right_pads     = detail::to_spatial_array<SPATIAL_DIM>(this->input_right_pad);
+
+        problem.output_spatial = detail::compute_output_spatial<SPATIAL_DIM>(problem.input_spatial,
+                                                                             problem.filter_spatial,
+                                                                             problem.conv_strides,
+                                                                             problem.conv_dilations,
+                                                                             problem.left_pads,
+                                                                             problem.right_pads);
+
+        return problem;
+    }
+
     /// This function returns the `TensorDescriptor` corresponding to
     /// the input-tensor of the convolution problem. This can then
     /// be used to, for example, allocate memory.
     InputDescriptor make_input_descriptor() const
     {
-        // TODO: We're using old CK functionality to compute the right
-        // values here, mainly because CK tile does not support the
-        // right tensor layouts here. We should probably change that
-        // because CK currently prints an annoying message about it,
-        // plus that would let us get rid of the `to_ck_conv_param()`
-        // function.
-        const auto param = to_ck_conv_param();
-        const auto desc  = ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed<
-             typename Layouts::InLayout>(param);
         using Extent = typename InputDescriptor::Extent;
-        return InputDescriptor(Extent::from_vector(desc.GetLengths()),
-                               Extent::from_vector(desc.GetStrides()));
+        Extent lens  = {};
+
+        lens[0] = this->lengths.groups;
+        lens[1] = this->lengths.batch_size;
+        lens[2] = this->lengths.input_channels;
+        if constexpr(SPATIAL_DIM == 1)
+        {
+            lens[3] = this->lengths.image.width;
+        }
+        else if constexpr(SPATIAL_DIM == 2)
+        {
+            lens[3] = this->lengths.image.height;
+            lens[4] = this->lengths.image.width;
+        }
+        else
+        {
+            lens[3] = this->lengths.image.depth;
+            lens[4] = this->lengths.image.height;
+            lens[5] = this->lengths.image.width;
+        }
+
+        const auto make_default_strides = [&] {
+            constexpr auto layout = SIGNATURE.input.config.layout;
+
+            if constexpr(SPATIAL_DIM == 1)
+            {
+                if constexpr(layout == TensorLayout::GNCW)
+                    return detail::make_packed_strides_for_order<4>(lens, {0, 1, 2, 3});
+                else if constexpr(layout == TensorLayout::GNWC ||
+                                  layout == TensorLayout::G_NW_C_strided)
+                    return detail::make_packed_strides_for_order<4>(lens, {0, 1, 3, 2});
+                else if constexpr(layout == TensorLayout::NWGC)
+                    return detail::make_packed_strides_for_order<4>(lens, {1, 3, 0, 2});
+                else if constexpr(layout == TensorLayout::NGCW)
+                    return detail::make_packed_strides_for_order<4>(lens, {1, 0, 2, 3});
+                else
+                    static_assert(sizeof(UnsupportedEnumValue<layout>) == 0,
+                                  "Unsupported 1D input layout for descriptor initialization.");
+            }
+            else if constexpr(SPATIAL_DIM == 2)
+            {
+                if constexpr(layout == TensorLayout::GNCHW)
+                    return detail::make_packed_strides_for_order<5>(lens, {0, 1, 2, 3, 4});
+                else if constexpr(layout == TensorLayout::GNHWC ||
+                                  layout == TensorLayout::G_NHW_C_strided)
+                    return detail::make_packed_strides_for_order<5>(lens, {0, 1, 3, 4, 2});
+                else if constexpr(layout == TensorLayout::NHWGC)
+                    return detail::make_packed_strides_for_order<5>(lens, {1, 3, 4, 0, 2});
+                else if constexpr(layout == TensorLayout::NGCHW)
+                    return detail::make_packed_strides_for_order<5>(lens, {1, 0, 2, 3, 4});
+                else
+                    static_assert(sizeof(UnsupportedEnumValue<layout>) == 0,
+                                  "Unsupported 2D input layout for descriptor initialization.");
+            }
+            else
+            {
+                if constexpr(layout == TensorLayout::GNCDHW)
+                    return detail::make_packed_strides_for_order<6>(lens, {0, 1, 2, 3, 4, 5});
+                else if constexpr(layout == TensorLayout::GNDHWC ||
+                                  layout == TensorLayout::G_NDHW_C_strided)
+                    return detail::make_packed_strides_for_order<6>(lens, {0, 1, 3, 4, 5, 2});
+                else if constexpr(layout == TensorLayout::NDHWGC)
+                    return detail::make_packed_strides_for_order<6>(lens, {1, 3, 4, 5, 0, 2});
+                else if constexpr(layout == TensorLayout::NGCDHW)
+                    return detail::make_packed_strides_for_order<6>(lens, {1, 0, 2, 3, 4, 5});
+                else
+                    static_assert(sizeof(UnsupportedEnumValue<layout>) == 0,
+                                  "Unsupported 3D input layout for descriptor initialization.");
+            }
+        };
+
+        const Extent strides =
+            this->input_strides.has_value() ? *this->input_strides : make_default_strides();
+
+        return InputDescriptor(lens, strides);
     }
 
     /// This function returns the `TensorDescriptor` corresponding to
@@ -117,13 +329,76 @@ struct Args<SIGNATURE>
     /// be used to, for example, allocate memory.
     WeightDescriptor make_weight_descriptor() const
     {
-        // See note in implementation of `make_input_descriptor`.
-        const auto param = to_ck_conv_param();
-        const auto desc  = ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed<
-             typename Layouts::WeiLayout>(param);
         using Extent = typename WeightDescriptor::Extent;
-        return WeightDescriptor(Extent::from_vector(desc.GetLengths()),
-                                Extent::from_vector(desc.GetStrides()));
+        Extent lens  = {};
+
+        lens[0] = this->lengths.groups;
+        lens[1] = this->lengths.output_channels;
+        lens[2] = this->lengths.input_channels;
+        if constexpr(SPATIAL_DIM == 1)
+        {
+            lens[3] = this->lengths.filter.width;
+        }
+        else if constexpr(SPATIAL_DIM == 2)
+        {
+            lens[3] = this->lengths.filter.height;
+            lens[4] = this->lengths.filter.width;
+        }
+        else
+        {
+            lens[3] = this->lengths.filter.depth;
+            lens[4] = this->lengths.filter.height;
+            lens[5] = this->lengths.filter.width;
+        }
+
+        const auto make_default_strides = [&] {
+            constexpr auto layout = SIGNATURE.weight.config.layout;
+
+            if constexpr(SPATIAL_DIM == 1)
+            {
+                if constexpr(layout == TensorLayout::GKCX)
+                    return detail::make_packed_strides_for_order<4>(lens, {0, 1, 2, 3});
+                else if constexpr(layout == TensorLayout::GKXC ||
+                                  layout == TensorLayout::G_K_X_C_strided)
+                    return detail::make_packed_strides_for_order<4>(lens, {0, 1, 3, 2});
+                else if constexpr(layout == TensorLayout::KXGC)
+                    return detail::make_packed_strides_for_order<4>(lens, {1, 3, 0, 2});
+                else
+                    static_assert(sizeof(UnsupportedEnumValue<layout>) == 0,
+                                  "Unsupported 1D weight layout for descriptor initialization.");
+            }
+            else if constexpr(SPATIAL_DIM == 2)
+            {
+                if constexpr(layout == TensorLayout::GKCYX)
+                    return detail::make_packed_strides_for_order<5>(lens, {0, 1, 2, 3, 4});
+                else if constexpr(layout == TensorLayout::GKYXC ||
+                                  layout == TensorLayout::G_K_YX_C_strided)
+                    return detail::make_packed_strides_for_order<5>(lens, {0, 1, 3, 4, 2});
+                else if constexpr(layout == TensorLayout::KYXGC)
+                    return detail::make_packed_strides_for_order<5>(lens, {1, 3, 4, 0, 2});
+                else
+                    static_assert(sizeof(UnsupportedEnumValue<layout>) == 0,
+                                  "Unsupported 2D weight layout for descriptor initialization.");
+            }
+            else
+            {
+                if constexpr(layout == TensorLayout::GKCZYX)
+                    return detail::make_packed_strides_for_order<6>(lens, {0, 1, 2, 3, 4, 5});
+                else if constexpr(layout == TensorLayout::GKZYXC ||
+                                  layout == TensorLayout::G_K_ZYX_C_strided)
+                    return detail::make_packed_strides_for_order<6>(lens, {0, 1, 3, 4, 5, 2});
+                else if constexpr(layout == TensorLayout::KZYXGC)
+                    return detail::make_packed_strides_for_order<6>(lens, {1, 3, 4, 5, 0, 2});
+                else
+                    static_assert(sizeof(UnsupportedEnumValue<layout>) == 0,
+                                  "Unsupported 3D weight layout for descriptor initialization.");
+            }
+        };
+
+        const Extent strides =
+            this->weight_strides.has_value() ? *this->weight_strides : make_default_strides();
+
+        return WeightDescriptor(lens, strides);
     }
 
     /// This function returns the `TensorDescriptor` corresponding to
@@ -131,43 +406,84 @@ struct Args<SIGNATURE>
     /// be used to, for example, allocate memory.
     OutputDescriptor make_output_descriptor() const
     {
-        // See note in implementation of `make_input_descriptor`.
-        const auto param = to_ck_conv_param();
-        const auto desc  = ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed<
-             typename Layouts::OutLayout>(param);
         using Extent = typename OutputDescriptor::Extent;
-        return OutputDescriptor(Extent::from_vector(desc.GetLengths()),
-                                Extent::from_vector(desc.GetStrides()));
-    }
+        Extent lens  = {};
+
+        const auto problem = make_conv_problem();
+
+        lens[0] = this->lengths.groups;
+        lens[1] = this->lengths.batch_size;
+        lens[2] = this->lengths.output_channels;
+        if constexpr(SPATIAL_DIM == 1)
+        {
+            lens[3] = static_cast<size_t>(problem.output_spatial[0]);
+        }
+        else if constexpr(SPATIAL_DIM == 2)
+        {
+            lens[3] = static_cast<size_t>(problem.output_spatial[0]);
+            lens[4] = static_cast<size_t>(problem.output_spatial[1]);
+        }
+        else
+        {
+            lens[3] = static_cast<size_t>(problem.output_spatial[0]);
+            lens[4] = static_cast<size_t>(problem.output_spatial[1]);
+            lens[5] = static_cast<size_t>(problem.output_spatial[2]);
+        }
+
+        const auto make_default_strides = [&] {
+            constexpr auto layout = SIGNATURE.output.config.layout;
 
-    /// Convert the Args structure into a CK conv_param structure. This
-    /// function is mainly used to be able to use the existing
-    /// CK-functionality to obtain tensor descriptors.
-    ck::utils::conv::ConvParam to_ck_conv_param() const
-    {
-        const auto to_vector = [](const auto& extent) {
             if constexpr(SPATIAL_DIM == 1)
-                return std::vector<ck::index_t>{ck::index_t(extent.width)};
+            {
+                if constexpr(layout == TensorLayout::GNKW)
+                    return detail::make_packed_strides_for_order<4>(lens, {0, 1, 2, 3});
+                else if constexpr(layout == TensorLayout::GNWK ||
+                                  layout == TensorLayout::G_NW_K_strided)
+                    return detail::make_packed_strides_for_order<4>(lens, {0, 1, 3, 2});
+                else if constexpr(layout == TensorLayout::NWGK)
+                    return detail::make_packed_strides_for_order<4>(lens, {1, 3, 0, 2});
+                else if constexpr(layout == TensorLayout::NGKW)
+                    return detail::make_packed_strides_for_order<4>(lens, {1, 0, 2, 3});
+                else
+                    static_assert(sizeof(UnsupportedEnumValue<layout>) == 0,
+                                  "Unsupported 1D output layout for descriptor initialization.");
+            }
             else if constexpr(SPATIAL_DIM == 2)
-                return std::vector<ck::index_t>{ck::index_t(extent.height),
-                                                ck::index_t(extent.width)};
+            {
+                if constexpr(layout == TensorLayout::GNKHW)
+                    return detail::make_packed_strides_for_order<5>(lens, {0, 1, 2, 3, 4});
+                else if constexpr(layout == TensorLayout::GNHWK ||
+                                  layout == TensorLayout::G_NHW_K_strided)
+                    return detail::make_packed_strides_for_order<5>(lens, {0, 1, 3, 4, 2});
+                else if constexpr(layout == TensorLayout::NHWGK)
+                    return detail::make_packed_strides_for_order<5>(lens, {1, 3, 4, 0, 2});
+                else if constexpr(layout == TensorLayout::NGKHW)
+                    return detail::make_packed_strides_for_order<5>(lens, {1, 0, 2, 3, 4});
+                else
+                    static_assert(sizeof(UnsupportedEnumValue<layout>) == 0,
+                                  "Unsupported 2D output layout for descriptor initialization.");
+            }
             else
-                return std::vector<ck::index_t>{ck::index_t(extent.depth),
-                                                ck::index_t(extent.height),
-                                                ck::index_t(extent.width)};
+            {
+                if constexpr(layout == TensorLayout::GNKDHW)
+                    return detail::make_packed_strides_for_order<6>(lens, {0, 1, 2, 3, 4, 5});
+                else if constexpr(layout == TensorLayout::GNDHWK ||
+                                  layout == TensorLayout::G_NDHW_K_strided)
+                    return detail::make_packed_strides_for_order<6>(lens, {0, 1, 3, 4, 5, 2});
+                else if constexpr(layout == TensorLayout::NDHWGK)
+                    return detail::make_packed_strides_for_order<6>(lens, {1, 3, 4, 5, 0, 2});
+                else if constexpr(layout == TensorLayout::NGKDHW)
+                    return detail::make_packed_strides_for_order<6>(lens, {1, 0, 2, 3, 4, 5});
+                else
+                    static_assert(sizeof(UnsupportedEnumValue<layout>) == 0,
+                                  "Unsupported 3D output layout for descriptor initialization.");
+            }
         };
 
-        return ck::utils::conv::ConvParam(SPATIAL_DIM,
-                                          this->lengths.groups,
-                                          this->lengths.batch_size,
-                                          this->lengths.output_channels,
-                                          this->lengths.input_channels,
-                                          to_vector(this->lengths.filter),
-                                          to_vector(this->lengths.image),
-                                          to_vector(this->filter_strides),
-                                          to_vector(this->filter_dilation),
-                                          to_vector(this->input_left_pad),
-                                          to_vector(this->input_right_pad));
+        const Extent strides =
+            this->output_strides.has_value() ? *this->output_strides : make_default_strides();
+
+        return OutputDescriptor(lens, strides);
     }
 };
 
diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp
index a90f53ba7d..c0b9b9c82f 100644
--- a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp
+++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck.hpp
@@ -116,7 +116,7 @@ void run(CkConvInstance<SIGNATURE> auto& conv,
         return result;
     };
 
-    const auto param = args.to_ck_conv_param();
+    const auto problem = args.make_conv_problem();
 
     const auto input_desc  = args.make_input_descriptor();
     const auto weight_desc = args.make_weight_descriptor();
@@ -134,10 +134,10 @@ void run(CkConvInstance<SIGNATURE> auto& conv,
                                      {},
                                      to_ck_lengths(output_desc.get_lengths()),
                                      to_ck_lengths(output_desc.get_strides()),
-                                     to_ck_extent(param.conv_filter_strides_),
-                                     to_ck_extent(param.conv_filter_dilations_),
-                                     to_ck_extent(param.input_left_pads_),
-                                     to_ck_extent(param.input_right_pads_),
+                                     to_ck_extent(problem.conv_strides),
+                                     to_ck_extent(problem.conv_dilations),
+                                     to_ck_extent(problem.left_pads),
+                                     to_ck_extent(problem.right_pads),
                                      args.a_elementwise_op,
                                      args.b_elementwise_op,
                                      args.cde_elementwise_op);
diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp
new file mode 100644
index 0000000000..79f46687c6
--- /dev/null
+++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_ck_tile.hpp
@@ -0,0 +1,110 @@
+// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
+// SPDX-License-Identifier: MIT
+
+#pragma once
+
+#include "ck_tile/builder/testing/conv_fwd.hpp"
+#include "ck_tile/ops/grouped_convolution/utils/grouped_convolution_utils.hpp"
+#include "ck_tile/host/kernel_launch.hpp"
+#include <stdexcept>
+#include <type_traits>
+#include <utility>
+#include <vector>
+
+/// This file contains the implementation details for invoking/testing
+/// grouped forward convolution operations using CK Tile kernels.
+/// The main item is the `run()` function, which is used to invoke the
+/// CK Tile grouped convolution forward kernel instances.
+
+namespace ck_tile::builder::test {
+
+template <typename Conv, auto SIGNATURE>
+concept TileConvInstance = requires {
+    typename Conv::CDElementwise;
+    typename Conv::GroupedConvFwdKernelArgsSpecialized;
+    {
+        Conv::MakeKernelArgs(
+            std::declval<ck_tile::GroupedConvFwdHostArgs<typename Conv::CDElementwise>>())
+    } -> std::convertible_to<typename Conv::GroupedConvFwdKernelArgsSpecialized>;
+};
+
+/// @brief `run()` specialization for forward convolution and CK Tile.
+///
+/// @tparam SIGNATURE Forward convolution signature.
+/// @throws std::runtime_error if the arguments weren't actually valid for the
+/// operation. This should be caught and reported by the testing framework.
+///
+/// @see run()
+template <auto SIGNATURE>
+    requires ValidConvSignature<SIGNATURE> && ConvDirectionIsForward<SIGNATURE>
+void run(TileConvInstance<SIGNATURE> auto& conv,
+         const Args<SIGNATURE>& args,
+         const Inputs<SIGNATURE>& inputs,
+         const Outputs<SIGNATURE>& outputs)
+{
+    constexpr auto spatial_dim = SIGNATURE.spatial_dim;
+
+    // For now, CK Tile EndToEnd only supports packed tensors.
+    // (Explicit custom strides are added as API in Args, but tile kernels
+    // are not yet wired for that in the testing framework.)
+    if(!args.make_input_descriptor().is_packed())
+        throw std::runtime_error("TODO: Support non-packed input tensor in CK Tile runner");
+    if(!args.make_weight_descriptor().is_packed())
+        throw std::runtime_error("TODO: Support non-packed weight tensor in CK Tile runner");
+    if(!args.make_output_descriptor().is_packed())
+        throw std::runtime_error("TODO: Support non-packed output tensor in CK Tile runner");
+
+    const auto problem = args.make_conv_problem();
+
+    const std::vector<ck_tile::long_index_t> input_spatial(problem.input_spatial.begin(),
+                                                           problem.input_spatial.end());
+    const std::vector<ck_tile::long_index_t> filter_spatial(problem.filter_spatial.begin(),
+                                                            problem.filter_spatial.end());
+    const std::vector<ck_tile::long_index_t> conv_strides(problem.conv_strides.begin(),
+                                                          problem.conv_strides.end());
+    const std::vector<ck_tile::long_index_t> conv_dilations(problem.conv_dilations.begin(),
+                                                            problem.conv_dilations.end());
+    const std::vector<ck_tile::long_index_t> left_pads(problem.left_pads.begin(),
+                                                       problem.left_pads.end());
+    const std::vector<ck_tile::long_index_t> right_pads(problem.right_pads.begin(),
+                                                        problem.right_pads.end());
+
+    // CK Tile host args are built around ck_tile::conv::ConvParam.
+    ck_tile::conv::ConvParam conv_param(static_cast<ck_tile::long_index_t>(spatial_dim),
+                                        problem.G,
+                                        problem.N,
+                                        problem.K,
+                                        problem.C,
+                                        filter_spatial,
+                                        input_spatial,
+                                        conv_strides,
+                                        conv_dilations,
+                                        left_pads,
+                                        right_pads);
+
+    using Kernel        = std::remove_cvref_t<decltype(conv)>;
+    using CDElementwise = typename Kernel::CDElementwise;
+
+    ck_tile::GroupedConvFwdHostArgs<CDElementwise> host_args(conv_param,
+                                                             inputs.input,
+                                                             inputs.weight,
+                                                             {},
+                                                             outputs.output,
+                                                             /*k_batch=*/1,
+                                                             CDElementwise{});
+
+    auto kargs = Kernel::MakeKernelArgs(host_args);
+
+    if(!Kernel::IsSupportedArgument(kargs))
+    {
+        throw std::runtime_error("invalid argument");
+    }
+
+    const dim3 grids  = Kernel::GridSize(kargs);
+    const dim3 blocks = Kernel::BlockSize();
+
+    (void)ck_tile::launch_kernel(ck_tile::stream_config{},
+                                 ck_tile::make_kernel(Kernel{}, grids, blocks, 0, kargs));
+}
+
+} // namespace ck_tile::builder::test
diff --git a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_reference.hpp b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_reference.hpp
index 85493e32eb..c7a2518eab 100644
--- a/experimental/builder/include/ck_tile/builder/testing/conv_fwd_reference.hpp
+++ b/experimental/builder/include/ck_tile/builder/testing/conv_fwd_reference.hpp
@@ -74,16 +74,14 @@ void run(RefConvInstance<SIGNATURE> auto& conv,
          const Inputs<SIGNATURE>& inputs,
          const Outputs<SIGNATURE>& outputs)
 {
-    // We don't want to compute the output dims manually, just get
-    // them via the existing infrastructure
-    const auto param = args.to_ck_conv_param();
+    const auto problem = args.make_conv_problem();
 
     // TODO: The reference convolution is currently missing a few features.
     // Just throw for now, but regard these as TODO items that should be resolved
     // eventually.
 
     // Right pads are not supported right now for some reason.
-    for(auto right_pad : param.input_right_pads_)
+    for(auto right_pad : problem.right_pads)
     {
         if(right_pad != 0)
             throw std::runtime_error("TODO: Support right pad in reference conv");
@@ -96,19 +94,31 @@ void run(RefConvInstance<SIGNATURE> auto& conv,
     if(!args.make_output_descriptor().is_packed())
         throw std::runtime_error("TODO: Support non-packed output tensor in reference conv");
 
+    const std::vector<long_index_t> input_spatial(problem.input_spatial.begin(),
+                                                  problem.input_spatial.end());
+    const std::vector<long_index_t> filter_spatial(problem.filter_spatial.begin(),
+                                                   problem.filter_spatial.end());
+    const std::vector<long_index_t> output_spatial(problem.output_spatial.begin(),
+                                                   problem.output_spatial.end());
+    const std::vector<long_index_t> conv_strides(problem.conv_strides.begin(),
+                                                 problem.conv_strides.end());
+    const std::vector<long_index_t> conv_dilations(problem.conv_dilations.begin(),
+                                                   problem.conv_dilations.end());
+    const std::vector<long_index_t> left_pads(problem.left_pads.begin(), problem.left_pads.end());
+
     conv.Run(inputs.input,
              inputs.weight,
              outputs.output,
-             param.G_,
-             param.N_,
-             param.K_,
-             param.C_,
-             param.input_spatial_lengths_,
-             param.filter_spatial_lengths_,
-             param.output_spatial_lengths_,
-             param.conv_filter_strides_,
-             param.conv_filter_dilations_,
-             param.input_left_pads_);
+             problem.G,
+             problem.N,
+             problem.K,
+             problem.C,
+             input_spatial,
+             filter_spatial,
+             output_spatial,
+             conv_strides,
+             conv_dilations,
+             left_pads);
 }
 
 } // namespace ck_tile::builder::test
diff --git a/experimental/builder/test/conv/ck_tile/test_ckb_conv_fwd_2d_fp16_v3.cpp b/experimental/builder/test/conv/ck_tile/test_ckb_conv_fwd_2d_fp16_v3.cpp
index 2c35fb5076..2d4c237790 100644
--- a/experimental/builder/test/conv/ck_tile/test_ckb_conv_fwd_2d_fp16_v3.cpp
+++ b/experimental/builder/test/conv/ck_tile/test_ckb_conv_fwd_2d_fp16_v3.cpp
@@ -3,33 +3,46 @@
 
 #include "utils/ckb_conv_tile_test_configs.hpp"
 #include "utils/ckb_conv_test_utils.hpp"
+#include "ck_tile/builder/testing/conv_fwd_ck_tile.hpp"
+#include "ck_tile/builder/testing/conv_fwd_reference.hpp"
+#include "ck_tile/host/device_prop.hpp"
+#include "testing_utils.hpp"
 
-namespace {
+namespace ckb = ck_tile::builder;
+namespace ckt = ck_tile::builder::test;
+namespace cku = ck_tile::builder::test_utils;
 
-using namespace ck_tile::builder::test_utils;
+using ck_tile::test::MatchesReference;
 
-TEST(FwdConvInstances, Create_ConvAlgorithm_Tile_GroupedConvolutionKernel_2D_FP16_NHWGC)
+constexpr auto SIGNATURE =
+    ckt::ConvSignature{.spatial_dim            = 2,
+                       .direction              = ckb::ConvDirection::FORWARD,
+                       .data_type              = ckb::DataType::FP16,
+                       .accumulation_data_type = ckb::DataType::FP32,
+                       .input                  = {.config = {.layout = ckb::TensorLayout::NHWGC}},
+                       .weight                 = {.config = {.layout = ckb::TensorLayout::GKYXC}},
+                       .output                 = {.config = {.layout = ckb::TensorLayout::NHWGK}}};
+
+constexpr auto TILE_ALGORITHM = cku::ConvAlgorithm_Tile_GroupedConvolutionKernel{}
+                                    .with_tile_specializations(ckb::TileConvSpecialization::DEFAULT)
+                                    .with_tile_thread_block(cku::TileThreadBlock_64x64x64)
+                                    .with_tile_block_gemm(cku::TileBlockGemmDesc_16x16_v3_intrawave)
+                                    .with_tile_transfer(cku::TileTransfer_4x4x4)
+                                    .with_tile_optimizations(ckt::TileOptimizations{
+                                        .num_groups_to_merge = 1,
+                                        .split_image         = false,
+                                        .explicit_gemm       = false,
+                                    });
+
+using Builder   = ckb::ConvBuilder<SIGNATURE, TILE_ALGORITHM>;
+using TileConv  = Builder::Instance;
+using Reference = ckb::ConvBuilder<SIGNATURE, ckt::ConvAlgorithm_Reference{}>::Instance;
+
+TEST(Fwd2DFp16_TileV3_NHWGC, Create)
 {
-    constexpr ConvSignature FwdConvSignature{.spatial_dim            = 2,
-                                             .direction              = ConvDirection::FORWARD,
-                                             .data_type              = DataType::FP16,
-                                             .accumulation_data_type = DataType::FP32,
-                                             .input  = {.config = {.layout = TensorLayout::NHWGC}},
-                                             .weight = {.config = {.layout = TensorLayout::GKYXC}},
-                                             .output = {.config = {.layout = TensorLayout::NHWGK}}};
-
-    constexpr auto FwdConvAlgorithm =
-        ConvAlgorithm_Tile_GroupedConvolutionKernel{}
-            .with_tile_specializations(TileConvSpecialization::DEFAULT)
-            .with_tile_thread_block(TileThreadBlock_64x64x64)
-            .with_tile_block_gemm(TileBlockGemmDesc_16x16_v3_intrawave)
-            .with_tile_transfer(TileTransfer_4x4x4)
-            .with_tile_optimizations(TileOptimizations{
-                .num_groups_to_merge = 1, .split_image = false, .explicit_gemm = false});
-
-    using Builder = ConvBuilder<FwdConvSignature, FwdConvAlgorithm>;
-    run_ck_tile_test<Builder>({
-        "grouped_convolution_forward",
+    const auto expected_type_string = "grouped_convolution_forward";
+    cku::run_ck_tile_test<Builder>({
+        expected_type_string,
         "fp16",
         "NHWGC_GKYXC_NHWGK",
         "64x64x64",
@@ -48,4 +61,51 @@ TEST(FwdConvInstances, Create_ConvAlgorithm_Tile_GroupedConvolutionKernel_2D_FP1
     });
 }
 
-} // namespace
+TEST(Fwd2DFp16_TileV3_NHWGC, EndToEnd)
+{
+    if(!ck_tile::get_device_name().starts_with("gfx9"))
+    {
+        GTEST_SKIP() << "unsupported architecture";
+    }
+
+    ckt::Args<SIGNATURE> args = {
+        .lengths =
+            {
+                .batch_size      = 16,
+                .groups          = 1,
+                .input_channels  = 32,
+                .output_channels = 48,
+                .image =
+                    {
+                        .width  = 56,
+                        .height = 64,
+                    },
+                .filter =
+                    {
+                        .width  = 3,
+                        .height = 5,
+                    },
+            },
+        .filter_strides     = {.width = 1, .height = 1},
+        .filter_dilation    = {.width = 1, .height = 1},
+        .input_left_pad     = {.width = 0, .height = 0},
+        .input_right_pad    = {.width = 0, .height = 0},
+        .a_elementwise_op   = {},
+        .b_elementwise_op   = {},
+        .cde_elementwise_op = {},
+    };
+
+    auto inputs    = ckt::alloc_inputs(args);
+    auto outputs   = ckt::alloc_outputs(args);
+    auto reference = ckt::alloc_outputs(args);
+
+    ckt::init_inputs(args, inputs.get());
+
+    auto tile_conv = TileConv{};
+    ckt::run(tile_conv, args, inputs.get(), outputs.get());
+
+    auto ref_conv = Reference{};
+    ckt::run(ref_conv, args, inputs.get(), reference.get());
+
+    EXPECT_THAT(outputs.get(), MatchesReference(args, reference.get()));
+}
diff --git a/experimental/builder/test/unit_validation.cpp b/experimental/builder/test/unit_validation.cpp
index a83d034ac2..7cc9968f21 100644
--- a/experimental/builder/test/unit_validation.cpp
+++ b/experimental/builder/test/unit_validation.cpp
@@ -142,10 +142,12 @@ TEST(ValidationReportTests, MultipleSomeIncorrect)
         auto a = ckt::alloc_tensor_buffer(desc);
         auto b = ckt::alloc_tensor_buffer(desc);
 
-        ckt::fill_tensor_buffer(
-            desc, a.get(), [](size_t i) { return ck::type_convert<ck::bhalf_t>(i % 100); });
-        ckt::fill_tensor_buffer(
-            desc, b.get(), [](size_t i) { return ck::type_convert<ck::bhalf_t>(i % 101); });
+        ckt::fill_tensor_buffer(desc, a.get(), [](size_t i) {
+            return ck::type_convert<ck::bhalf_t>(static_cast<float>(i % 100));
+        });
+        ckt::fill_tensor_buffer(desc, b.get(), [](size_t i) {
+            return ck::type_convert<ck::bhalf_t>(static_cast<float>(i % 101));
+        });
 
         report.check("incorrect 1", desc, b.get(), a.get());
     }