From 89e943a9f381b0a0e45fae4b59ea306a315aec70 Mon Sep 17 00:00:00 2001 From: JH-Leon-KIM-AMD Date: Mon, 29 Dec 2025 16:11:08 +0200 Subject: [PATCH] [CK_BUILDER] Add GPU Reference Algorithm to CK Builder (#3381) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [CK_BUILDER] Integrate GPU reference as ConvAlgorithm Add GPU reference as a ConvAlgorithm specialization, enabling: - Unified Builder API for reference and optimized kernels - Future ckProfiler integration for validation - First step toward numerical validation in Builder tests Changes: - Add ConvAlgorithmSpecialization::REFERENCE enum - Add ConvAlgorithm_Reference struct - Add IsReferenceAlgorithm concept - Create 3 reference factories (Forward, BwdData, BwdWeight) - Wire into conv_dispatcher - Add proof-of-concept test (passing) Test result: Can instantiate reference through Builder API * Add GPU reference execution tests - Reference kernel executes through Builder (459ms) - Both reference and optimized can instantiate - Tests passing Next: Implement utilities for comparison * Optimized Builder kernel execution works - MakeArgument pattern implemented - Builder-generated kernel executes successfully - Tests passing (451ms execution) Next: Add comparison * VALIDATION COMPLETE: Builder == Reference Builder-generated kernel output matches GPU reference! Test: Validate_Optimized_vs_Reference_Forward_2D_FP16 Result: PASS ✓ This proves CK Builder generates correct code! * Update to new Builder API All tests passing * Rename test file for clarity test_builder_kernel_execution -> test_builder_kernel_validation * Add all 3 directions support - Forward, Backward Data, Backward Weight - All reference factories working - Dispatcher wired for all directions - 9 tests passing Tests: - test_reference_execution: 3 tests (all directions) - test_optimized_execution: 3 tests (all directions) - test_builder_kernel_validation: 3 tests (fwd validated, bwd placeholders) * Add backward direction support - Backward data and weight dispatcher wiring - Fix factories for new API - All 3 directions tested - 9 tests passing * Refactor: Change IsReferenceAlgorithm from concept to consteval function Address review feedback: Use consteval function in dispatcher instead of concept, matching the pattern for other algorithms (Tile, XDL, WMMA, DL). - Remove IsReferenceAlgorithm concept from conv_algorithm_concepts.hpp - Add IsReferenceAlgorithm() consteval function to conv_dispatcher.hpp - Update dispatcher to use function call: IsReferenceAlgorithm() - Remove redundant algorithm checks from reference factory requires clauses All tests passing (9/9). * Move Tile algorithm check outside direction block to support all directions * Implement MakeInvokerPointer interface and add random input validation - Implement full Argument/Invoker structs for old CK interface (not just nullptr) - Refactor with reference_common.hpp to reduce code duplication - Add random input validation tests: Builder vs direct GPU reference (all directions) - Fix layout: GNHWC -> NHWGC to match reference kernel expectations - All 12 tests pass with IDENTICAL results on random input * Move ConvAlgorithm_Reference to test/impl/conv_algorithm_types.hpp Keep types.hpp for data types only (enums), move algorithm descriptors to conv_algorithm_types.hpp as suggested by review. * Add static_assert to ensure reference factories only accept PassThrough operations Reference implementation doesn't support fused elementwise operations. Add compile-time validation to fail early with clear error message if non-PassThrough operations are specified on input, weight, or output. * Add InstanceTraits support for reference kernels - Store SIGNATURE/ALGORITHM/VERSION in Instance for reflection - Create shared ReferenceCommonTraits base for common properties - Add 3 direction-specific InstanceTraits specializations in one file - Include data type and layouts in instance_string output * Remove optimized kernel validation tests from reference-only branch * Use existing layout helper and organize reference tests Use LayoutToCK from conv_tensor_layout.hpp and move reference InstanceTraits test to validation folder. * Merge develop branch Fix DataType switch for new mixed precision types. * Fix comment spacing for CI * Convert IsReferenceAlgorithm from function to concept * Add reference tests to CI smoke tests * Consolidate 3 reference factories into single unified factory --------- Co-authored-by: Ville Pietilä <188998872+vpietila-amd@users.noreply.github.com> [ROCm/composable_kernel commit: a0acc83a72c84a8cdbbdef6f397e617ac040aa72] --- .../builder/factory/conv_dispatcher.hpp | 50 +- .../builder/factory/reference_common.hpp | 118 ++ .../builder/factory/reference_factory.hpp | 249 ++++ .../reflect/instance_traits_reference.hpp | 191 +++ .../builder/include/ck_tile/builder/types.hpp | 3 +- experimental/builder/test/CMakeLists.txt | 35 +- .../test/impl/conv_algorithm_types.hpp | 9 + .../validation/test_reference_execution.cpp | 1031 +++++++++++++++++ .../test_reference_instance_traits.cpp | 117 ++ 9 files changed, 1774 insertions(+), 29 deletions(-) create mode 100644 experimental/builder/include/ck_tile/builder/factory/reference_common.hpp create mode 100644 experimental/builder/include/ck_tile/builder/factory/reference_factory.hpp create mode 100644 experimental/builder/include/ck_tile/builder/reflect/instance_traits_reference.hpp create mode 100644 experimental/builder/test/validation/test_reference_execution.cpp create mode 100644 experimental/builder/test/validation/test_reference_instance_traits.cpp diff --git a/experimental/builder/include/ck_tile/builder/factory/conv_dispatcher.hpp b/experimental/builder/include/ck_tile/builder/factory/conv_dispatcher.hpp index 99e7479e36..c0dd3d8018 100644 --- a/experimental/builder/include/ck_tile/builder/factory/conv_dispatcher.hpp +++ b/experimental/builder/include/ck_tile/builder/factory/conv_dispatcher.hpp @@ -9,10 +9,11 @@ // ## Design Overview // // The dispatcher operates in two phases: -// 1. **Algorithm Identification**: Five `consteval` predicate functions (`IsXdlV3Algorithm`, -// `IsXdlAlgorithm`, `IsWmmaAlgorithm`, `IsDlAlgorithm`, `IsLargeTensorAlgorithm`) inspect -// the algorithm descriptor's structure to determine which kernel variant it satisfies. -// Each predicate checks a specific set of concept constraints that define a kernel variant. +// 1. **Algorithm Identification**: Six `consteval` predicate functions (`IsReferenceAlgorithm`, +// `IsXdlV3Algorithm`, `IsXdlAlgorithm`, `IsWmmaAlgorithm`, `IsDlAlgorithm`, +// `IsLargeTensorAlgorithm`) inspect the algorithm descriptor's structure to determine which +// kernel variant it satisfies. Each predicate checks a specific set of concept constraints +// that define a kernel variant. // // 2. **Factory Routing**: The main `make_conv_instance()` function uses `if constexpr` // to dispatch to the appropriate factory class based on both the convolution direction @@ -21,6 +22,9 @@ // // ## Supported Kernel Variants // +// - **Reference**: Simple reference implementation for validation. Only requires a specialization +// field set to ConvAlgorithmSpecialization::REFERENCE. +// // - **XDL V3**: Newer XDL-based pipeline using block GEMM structure. Requires fewer parameters // than standard XDL (e.g., uses `SpecifiesBlockGemm` instead of scheduling/prefetch configs). // @@ -59,6 +63,7 @@ #include "ck_tile/builder/factory/conv_fwd_wmma_factory.hpp" #include "ck_tile/builder/factory/conv_fwd_dl_factory.hpp" #include "ck_tile/builder/factory/conv_fwd_large_tensor_factory.hpp" +#include "ck_tile/builder/factory/reference_factory.hpp" #include "ck_tile/builder/factory/conv_tile_factory.hpp" namespace ck_tile::builder::factory { @@ -82,6 +87,13 @@ namespace ck_tile::builder::factory { // // TODO: Make this dispatch logic much more robust and clear for users. +// Reference algorithm (simplest implementation for validation) +template +concept IsReferenceAlgorithm = ConvAlgorithmDescriptor && requires { + { T::specialization } -> std::convertible_to; + requires T::specialization == ConvAlgorithmSpecialization::REFERENCE; +}; + // CK Tile kernel template concept IsTileAlgorithm = ConvAlgorithmDescriptor && SpecifiesTileThreadBlock && @@ -132,11 +144,17 @@ constexpr auto make_conv_instance() { using AlgoType = std::remove_const_t; + // Reference algorithm supports all directions + if constexpr(IsReferenceAlgorithm) + { + return typename ReferenceFactory::Instance{}; + } // CK Tile supports common factory for each direction - if constexpr(IsTileAlgorithm) + else if constexpr(IsTileAlgorithm) { return typename ConvTileFactory::Instance{}; } + // Forward direction (supports most algorithm variants) else if constexpr(ConvDirectionIsForward) { if constexpr(IsXdlV3Algorithm) @@ -164,23 +182,25 @@ constexpr auto make_conv_instance() static_assert( false, "No suitable forward convolution kernel factory found for the provided ALGORITHM. " - "The ALGORITHM must satisfy requirements for one of: XDL V3, XDL, WMMA, DL (NHWC " - "layout), or Large Tensor variant."); + "The ALGORITHM must satisfy requirements for one of: Reference, Tile, XDL V3, XDL, " + "WMMA, DL (NHWC layout), or Large Tensor variant."); } } + // Backward data direction (will expand with more algorithms in the future) else if constexpr(ConvDirectionIsBackwardData) { - static_assert( - false, - "Backward data convolution is not yet supported. " - "Only forward convolution (ConvDirection::FORWARD) is currently implemented."); + static_assert(false, + "Backward data convolution: Only reference and tile algorithms supported " + "currently. " + "Optimized kernels (XDL, WMMA, etc.) not yet implemented."); } + // Backward weight direction (will expand with more algorithms in the future) else if constexpr(ConvDirectionIsBackwardWeight) { - static_assert( - false, - "Backward weight convolution is not yet supported. " - "Only forward convolution (ConvDirection::FORWARD) is currently implemented."); + static_assert(false, + "Backward weight convolution: Only reference and tile algorithms " + "supported currently. " + "Optimized kernels (XDL, WMMA, etc.) not yet implemented."); } else { diff --git a/experimental/builder/include/ck_tile/builder/factory/reference_common.hpp b/experimental/builder/include/ck_tile/builder/factory/reference_common.hpp new file mode 100644 index 0000000000..698ed43cb9 --- /dev/null +++ b/experimental/builder/include/ck_tile/builder/factory/reference_common.hpp @@ -0,0 +1,118 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include "ck_tile/core.hpp" +#include "ck_tile/builder/conv_signature_concepts.hpp" +#include "ck_tile/builder/types.hpp" +#include + +namespace ck_tile::builder::factory::internal { + +// Validation helper: Ensure reference implementation only receives PassThrough elementwise ops +template +consteval void ValidateReferenceSignature() +{ + using namespace ck_tile::builder; + + // Check input elementwise operation + static_assert( + !HasTensorOp || + SIGNATURE.input.operation.elementwise_operation == ElementwiseOperation::PASS_THROUGH, + "Reference implementation does not support elementwise operations on input tensor. " + "Input operation must be PassThrough (or not specified)."); + + // Check weight elementwise operation + static_assert( + !HasTensorOp || + SIGNATURE.weight.operation.elementwise_operation == ElementwiseOperation::PASS_THROUGH, + "Reference implementation does not support elementwise operations on weight tensor. " + "Weight operation must be PassThrough (or not specified)."); + + // Check output elementwise operation + static_assert( + !HasTensorOp || + SIGNATURE.output.operation.elementwise_operation == ElementwiseOperation::PASS_THROUGH, + "Reference implementation does not support elementwise operations on output tensor. " + "Output operation must be PassThrough (or not specified)."); +} + +// Common argument structure for reference convolution implementations +// Template parameters allow different const qualifiers for each direction +template +struct ReferenceConvArgument +{ + InPtrType input_; + WeiPtrType weight_; + OutPtrType output_; + int G_, N_, K_, C_; + std::vector input_spatial_; + std::vector filter_spatial_; + std::vector output_spatial_; + std::vector strides_; + std::vector dilations_; + std::vector left_pads_; + + ReferenceConvArgument(InPtrType input, + WeiPtrType weight, + OutPtrType output, + int G, + int N, + int K, + int C, + const std::vector& input_spatial, + const std::vector& filter_spatial, + const std::vector& output_spatial, + const std::vector& strides, + const std::vector& dilations, + const std::vector& left_pads) + : input_(input), + weight_(weight), + output_(output), + G_(G), + N_(N), + K_(K), + C_(C), + input_spatial_(input_spatial), + filter_spatial_(filter_spatial), + output_spatial_(output_spatial), + strides_(strides), + dilations_(dilations), + left_pads_(left_pads) + { + } +}; + +// Common invoker structure for reference convolution implementations +// Takes a callable (lambda or function pointer) to execute the actual convolution +template +struct ReferenceConvInvoker +{ + ConvFunc conv_func_; + + explicit ReferenceConvInvoker(ConvFunc func) : conv_func_(func) {} + + float Run(const ArgumentType* arg, const StreamConfig& stream_config = StreamConfig{}) + { + (void)stream_config; // Unused for reference implementation + + conv_func_(arg->input_, + arg->weight_, + arg->output_, + arg->G_, + arg->N_, + arg->K_, + arg->C_, + arg->input_spatial_, + arg->filter_spatial_, + arg->output_spatial_, + arg->strides_, + arg->dilations_, + arg->left_pads_); + + return 0.0f; // Reference implementation doesn't track timing + } +}; + +} // namespace ck_tile::builder::factory::internal diff --git a/experimental/builder/include/ck_tile/builder/factory/reference_factory.hpp b/experimental/builder/include/ck_tile/builder/factory/reference_factory.hpp new file mode 100644 index 0000000000..0246c805c2 --- /dev/null +++ b/experimental/builder/include/ck_tile/builder/factory/reference_factory.hpp @@ -0,0 +1,249 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#pragma once + +#include "ck_tile/ref/naive_grouped_conv_fwd_gpu.hpp" +#include "ck_tile/ref/naive_grouped_conv_bwd_data_gpu.hpp" +#include "ck_tile/ref/naive_grouped_conv_bwd_weight_gpu.hpp" +#include "ck_tile/builder/conv_signature_concepts.hpp" +#include "ck_tile/builder/conv_algorithm_concepts.hpp" +#include "ck_tile/builder/types.hpp" +#include "ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp" +#include "ck_tile/builder/factory/reference_common.hpp" +#include "ck_tile/core.hpp" +#include + +namespace ck_tile::builder::factory { + +// Unified Factory for GPU Reference Convolution (all directions) +template +struct ReferenceFactory +{ + // Validate that only PassThrough elementwise operations are specified + static constexpr auto kValidation = (internal::ValidateReferenceSignature(), 0); + + static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim; + using Types = internal::FwdConvTensorDataTypes; + + using InDataType = typename Types::ADataType; + using WeiDataType = typename Types::BDataType; + using OutDataType = typename Types::EDataType; + + struct Instance + { + // Store template parameters for InstanceTraits reflection + static constexpr auto kSignature = SIGNATURE; + static constexpr auto kAlgorithm = ALGORITHM; + static constexpr auto kVersion = VERSION; + + // Argument and Invoker types depend on direction + // Forward: const input, const weight, mutable output + // Backward Data: mutable input, const weight, const output_grad + // Backward Weight: const input, mutable weight_grad, const output_grad + + // Use appropriate Argument type based on direction + using Argument = std::conditional_t< + ConvDirectionIsForward, + internal::ReferenceConvArgument, + std::conditional_t< + ConvDirectionIsBackwardData, + internal:: + ReferenceConvArgument, + internal:: + ReferenceConvArgument>>; + + // Invoker calls the appropriate reference implementation based on direction + struct Invoker + { + float Run(const Argument* arg, const StreamConfig& stream_config = StreamConfig{}) + { + (void)stream_config; // Unused for reference implementation + + if constexpr(ConvDirectionIsForward) + { + ck_tile:: + naive_grouped_conv_fwd( + arg->input_, + arg->weight_, + arg->output_, + arg->G_, + arg->N_, + arg->K_, + arg->C_, + arg->input_spatial_, + arg->filter_spatial_, + arg->output_spatial_, + arg->strides_, + arg->dilations_, + arg->left_pads_); + } + else if constexpr(ConvDirectionIsBackwardData) + { + ck_tile::naive_grouped_conv_bwd_data(arg->input_, + arg->weight_, + arg->output_, + arg->G_, + arg->N_, + arg->K_, + arg->C_, + arg->input_spatial_, + arg->filter_spatial_, + arg->output_spatial_, + arg->strides_, + arg->dilations_, + arg->left_pads_); + } + else if constexpr(ConvDirectionIsBackwardWeight) + { + ck_tile::naive_grouped_conv_bwd_weight(arg->input_, + arg->weight_, + arg->output_, + arg->G_, + arg->N_, + arg->K_, + arg->C_, + arg->input_spatial_, + arg->filter_spatial_, + arg->output_spatial_, + arg->strides_, + arg->dilations_, + arg->left_pads_); + } + + return 0.0f; // Reference implementation doesn't track timing + } + }; + + // Direct Run method (simpler interface, direction-agnostic) + template + static void Run(InPtrType input, + WeiPtrType weight, + OutPtrType output, + int G, + int N, + int K, + int C, + const std::vector& input_spatial, + const std::vector& filter_spatial, + const std::vector& output_spatial, + const std::vector& strides, + const std::vector& dilations, + const std::vector& left_pads) + { + if constexpr(ConvDirectionIsForward) + { + ck_tile::naive_grouped_conv_fwd( + input, + weight, + output, + G, + N, + K, + C, + input_spatial, + filter_spatial, + output_spatial, + strides, + dilations, + left_pads); + } + else if constexpr(ConvDirectionIsBackwardData) + { + ck_tile:: + naive_grouped_conv_bwd_data( + input, + weight, + output, + G, + N, + K, + C, + input_spatial, + filter_spatial, + output_spatial, + strides, + dilations, + left_pads); + } + else if constexpr(ConvDirectionIsBackwardWeight) + { + ck_tile::naive_grouped_conv_bwd_weight(input, + weight, + output, + G, + N, + K, + C, + input_spatial, + filter_spatial, + output_spatial, + strides, + dilations, + left_pads); + } + } + + std::string GetTypeString() const + { + std::string dir_str; + if constexpr(ConvDirectionIsForward) + dir_str = "Forward"; + else if constexpr(ConvDirectionIsBackwardData) + dir_str = "BackwardData"; + else if constexpr(ConvDirectionIsBackwardWeight) + dir_str = "BackwardWeight"; + + return std::string("GPU_Reference_") + dir_str + "_" + std::to_string(SPATIAL_DIM) + + "D"; + } + + // Old CK interface: Create argument pointer + template + std::unique_ptr + MakeArgumentPointer(InPtrType input, + WeiPtrType weight, + OutPtrType output, + int G, + int N, + int K, + int C, + const std::vector& input_spatial, + const std::vector& filter_spatial, + const std::vector& output_spatial, + const std::vector& strides, + const std::vector& dilations, + const std::vector& left_pads) const + { + return std::make_unique(input, + weight, + output, + G, + N, + K, + C, + input_spatial, + filter_spatial, + output_spatial, + strides, + dilations, + left_pads); + } + + // Old CK interface: Create invoker pointer + std::unique_ptr MakeInvokerPointer() const { return std::make_unique(); } + }; +}; + +} // namespace ck_tile::builder::factory diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_reference.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_reference.hpp new file mode 100644 index 0000000000..b2e8bb6a7c --- /dev/null +++ b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_reference.hpp @@ -0,0 +1,191 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +// InstanceTraits specializations for Reference convolution kernels +// +// This file provides compile-time reflection for all three reference kernel directions +// (Forward, Backward Data, Backward Weight) using a shared base to reduce duplication. + +#pragma once + +#include "instance_traits.hpp" +#include "instance_traits_util.hpp" +#include "ck_tile/builder/factory/reference_factory.hpp" +#include "ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp" +#include "ck_tile/builder/factory/helpers/ck/conv_tensor_layout.hpp" +#include + +namespace ck_tile::reflect { + +namespace internal { + +// Common traits shared by all reference implementations +template +struct ReferenceCommonTraits +{ + // Spatial dimension + static constexpr int kSpatialDim = SIGNATURE.spatial_dim; + + // Layouts - map from enum to type using existing helper + using InLayout = + typename builder::factory::internal::LayoutToCK::type; + using WeiLayout = + typename builder::factory::internal::LayoutToCK::type; + using OutLayout = + typename builder::factory::internal::LayoutToCK::type; + + // Data types - extract from factory's type helper + using Types = builder::factory::internal::FwdConvTensorDataTypes; + using ADataType = typename Types::ADataType; + using BDataType = typename Types::BDataType; + using EDataType = typename Types::EDataType; + using AccDataType = float; // Reference uses float accumulation + + // Elementwise operations - reference only supports PassThrough + using AElementwiseOperation = ck_tile::element_wise::PassThrough; + using BElementwiseOperation = ck_tile::element_wise::PassThrough; + using CDEElementwiseOperation = ck_tile::element_wise::PassThrough; + + // Reference has no block/tile configuration (simple kernel) + // These are set to 0 to indicate "not applicable" + static constexpr int kBlockSize = 0; + static constexpr int kMPerBlock = 0; + static constexpr int kNPerBlock = 0; + static constexpr int kKPerBlock = 0; +}; + +} // namespace internal + +// ============================================================================ +// InstanceTraits specialization for Reference Forward Convolution +// ============================================================================ +template + requires( + std::is_same_v, + builder::ConvAlgorithmSpecialization> && + (Instance::kAlgorithm.specialization == builder::ConvAlgorithmSpecialization::REFERENCE) && + builder::ConvDirectionIsForward) +struct InstanceTraits : internal::ReferenceCommonTraits +{ + using Base = internal::ReferenceCommonTraits; + + // Bring base class members into scope + using Base::kBlockSize; + using Base::kKPerBlock; + using Base::kMPerBlock; + using Base::kNPerBlock; + using Base::kSpatialDim; + using typename Base::AccDataType; + using typename Base::ADataType; + using typename Base::AElementwiseOperation; + using typename Base::BDataType; + using typename Base::BElementwiseOperation; + using typename Base::CDEElementwiseOperation; + using typename Base::EDataType; + using typename Base::InLayout; + using typename Base::OutLayout; + using typename Base::WeiLayout; + + static constexpr builder::ConvDirection direction = builder::ConvDirection::FORWARD; + + static std::string instance_string() + { + std::ostringstream oss; + oss << "GPU_Reference_Forward_" << kSpatialDim << "D"; + oss << "_" << detail::type_name(); + oss << "_" << detail::layout_name(); + oss << "_" << detail::layout_name(); + oss << "_" << detail::layout_name(); + return oss.str(); + } +}; + +// ============================================================================ +// InstanceTraits specialization for Reference Backward Data Convolution +// ============================================================================ +template + requires( + std::is_same_v, + builder::ConvAlgorithmSpecialization> && + (Instance::kAlgorithm.specialization == builder::ConvAlgorithmSpecialization::REFERENCE) && + builder::ConvDirectionIsBackwardData) +struct InstanceTraits : internal::ReferenceCommonTraits +{ + using Base = internal::ReferenceCommonTraits; + + // Bring base class members into scope + using Base::kBlockSize; + using Base::kKPerBlock; + using Base::kMPerBlock; + using Base::kNPerBlock; + using Base::kSpatialDim; + using typename Base::AccDataType; + using typename Base::ADataType; + using typename Base::AElementwiseOperation; + using typename Base::BDataType; + using typename Base::BElementwiseOperation; + using typename Base::CDEElementwiseOperation; + using typename Base::EDataType; + using typename Base::InLayout; + using typename Base::OutLayout; + using typename Base::WeiLayout; + + static constexpr builder::ConvDirection direction = builder::ConvDirection::BACKWARD_DATA; + + static std::string instance_string() + { + std::ostringstream oss; + oss << "GPU_Reference_BackwardData_" << kSpatialDim << "D"; + oss << "_" << detail::type_name(); + oss << "_" << detail::layout_name(); + oss << "_" << detail::layout_name(); + oss << "_" << detail::layout_name(); + return oss.str(); + } +}; + +// ============================================================================ +// InstanceTraits specialization for Reference Backward Weight Convolution +// ============================================================================ +template + requires( + std::is_same_v, + builder::ConvAlgorithmSpecialization> && + (Instance::kAlgorithm.specialization == builder::ConvAlgorithmSpecialization::REFERENCE) && + builder::ConvDirectionIsBackwardWeight) +struct InstanceTraits : internal::ReferenceCommonTraits +{ + using Base = internal::ReferenceCommonTraits; + + // Bring base class members into scope + using Base::kBlockSize; + using Base::kKPerBlock; + using Base::kMPerBlock; + using Base::kNPerBlock; + using Base::kSpatialDim; + using typename Base::AccDataType; + using typename Base::ADataType; + using typename Base::AElementwiseOperation; + using typename Base::BDataType; + using typename Base::BElementwiseOperation; + using typename Base::CDEElementwiseOperation; + using typename Base::EDataType; + using typename Base::InLayout; + using typename Base::OutLayout; + using typename Base::WeiLayout; + + static constexpr builder::ConvDirection direction = builder::ConvDirection::BACKWARD_WEIGHT; + + static std::string instance_string() + { + std::ostringstream oss; + oss << "GPU_Reference_BackwardWeight_" << kSpatialDim << "D"; + oss << "_" << detail::type_name(); + oss << "_" << detail::layout_name(); + oss << "_" << detail::layout_name(); + oss << "_" << detail::layout_name(); + return oss.str(); + } +}; + +} // namespace ck_tile::reflect diff --git a/experimental/builder/include/ck_tile/builder/types.hpp b/experimental/builder/include/ck_tile/builder/types.hpp index f7386720b3..c1c62e91fa 100644 --- a/experimental/builder/include/ck_tile/builder/types.hpp +++ b/experimental/builder/include/ck_tile/builder/types.hpp @@ -248,7 +248,8 @@ enum class PipelineScheduler enum class ConvAlgorithmSpecialization { - LARGE_TENSOR + LARGE_TENSOR, + REFERENCE // GPU reference implementation for validation }; // toString methods for enum classes diff --git a/experimental/builder/test/CMakeLists.txt b/experimental/builder/test/CMakeLists.txt index eb4ef13462..424bfd8781 100644 --- a/experimental/builder/test/CMakeLists.txt +++ b/experimental/builder/test/CMakeLists.txt @@ -84,21 +84,29 @@ add_ck_builder_test(test_ckb_conv_builder unit_conv_tensor_layout.cpp unit_conv_tensor_type.cpp unit_conv_thread_block.cpp - unit_conv_tuning_params.cpp - unit_conv_fwd_testing.cpp) -target_link_libraries(test_ckb_conv_builder PRIVATE utility) + unit_conv_tuning_params.cpp) + + # Tests the inline diff utility used for comparing strings in tests assertions + add_ck_builder_test(test_ckb_inline_diff test_inline_diff.cpp) -# Tests the inline diff utility used for comparing strings in tests assertions -add_ck_builder_test(test_ckb_inline_diff test_inline_diff.cpp) - -# Tests convolution trait selection and configuration -add_ck_builder_test(test_ckb_conv_traits - conv/ck/test_conv_traits.cpp) - -# Tests convolution problem description and parameter handling -add_ck_builder_test(test_ckb_conv_description - test_conv_description.cpp) + # GPU reference validation tests (in validation/ folder) + # 1. Reference kernel execution and InstanceTraits + add_ck_builder_test(test_ckb_reference_execution + validation/test_reference_execution.cpp + validation/test_reference_instance_traits.cpp) + target_link_libraries(test_ckb_reference_execution PRIVATE utility) + + # Note: Optimized kernel validation tests will be added after merging dev branch + # with kernel Run() implementation from colleague's work + # Tests convolution trait selection and configuration + add_ck_builder_test(test_ckb_conv_traits + conv/ck/test_conv_traits.cpp) + + # Tests convolution problem description and parameter handling + add_ck_builder_test(test_ckb_conv_description + test_conv_description.cpp) + ################################################################################ # REGRESSION TESTS - Integration Tests (With Kernel Compilation) ################################################################################ @@ -181,6 +189,7 @@ set(CKB_SMOKE_TESTS test_ckb_inline_diff test_ckb_conv_traits test_ckb_conv_description + test_ckb_reference_execution ) foreach(test_target ${CKB_SMOKE_TESTS}) diff --git a/experimental/builder/test/impl/conv_algorithm_types.hpp b/experimental/builder/test/impl/conv_algorithm_types.hpp index 29c7f3cdcc..bf61eb7026 100644 --- a/experimental/builder/test/impl/conv_algorithm_types.hpp +++ b/experimental/builder/test/impl/conv_algorithm_types.hpp @@ -479,4 +479,13 @@ using ConvAlgorithm_Tile_GroupedConvolutionKernel = ConvAlgorithmTemplate; +// Reference algorithm descriptor - for GPU reference validation +// This is a simple algorithm that requires no complex configuration, +// just a specialization marker to identify it as a reference implementation. +struct ConvAlgorithm_Reference +{ + static constexpr auto specialization = ckb::ConvAlgorithmSpecialization::REFERENCE; + // GPU reference uses simple algorithm, no tile configuration needed +}; + } // namespace ck_tile::builder::test diff --git a/experimental/builder/test/validation/test_reference_execution.cpp b/experimental/builder/test/validation/test_reference_execution.cpp new file mode 100644 index 0000000000..29f9acacd3 --- /dev/null +++ b/experimental/builder/test/validation/test_reference_execution.cpp @@ -0,0 +1,1031 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "ck_tile/builder/conv_builder.hpp" +#include "ck_tile/builder/types.hpp" +#include "impl/conv_algorithm_types.hpp" +#include "ck_tile/ref/naive_grouped_conv_fwd_gpu.hpp" +#include "ck_tile/ref/naive_grouped_conv_bwd_data_gpu.hpp" +#include "ck_tile/ref/naive_grouped_conv_bwd_weight_gpu.hpp" +#include "utils/ckb_conv_test_configs.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/check_err.hpp" +#include +#include + +namespace { + +using namespace ck_tile::builder; +using namespace ck_tile::builder::test; // For ConvAlgorithm_Reference +using namespace ck_tile::builder::test_utils; + +TEST(ReferenceExecution, Forward_2D_FP16) +{ + // Note: When you don't specify .operation, it defaults to PassThrough + // Reference implementation only supports PassThrough elementwise operations + constexpr ConvSignature sig{.spatial_dim = 2, + .direction = ConvDirection::FORWARD, + .data_type = DataType::FP16, + .accumulation_data_type = DataType::FP32, + .input = {.config = {.layout = TensorLayout::GNHWC}}, + .weight = {.config = {.layout = TensorLayout::GKYXC}}, + .output = {.config = {.layout = TensorLayout::GNHWK}}}; + + constexpr auto ref_alg = ConvAlgorithm_Reference{}; + using RefKernel = ConvBuilder::Instance; + + // Simple dimensions + const int G = 1, N = 2, C = 4, K = 4, H = 3, W = 3; + + // Allocate minimal device memory (just to test API) + const size_t in_size = G * N * C * H * W * sizeof(ck::half_t); + const size_t wei_size = G * K * C * 3 * 3 * sizeof(ck::half_t); + const size_t out_size = G * N * K * H * W * sizeof(ck::half_t); + + ck::DeviceMem in_dev(in_size); + ck::DeviceMem wei_dev(wei_size); + ck::DeviceMem out_dev(out_size); + + in_dev.SetZero(); + wei_dev.SetZero(); + out_dev.SetZero(); + + // Prepare parameters for Run() + std::vector input_spatial{H, W}; + std::vector filter_spatial{3, 3}; + std::vector output_spatial{H, W}; + std::vector strides{1, 1}; + std::vector dilations{1, 1}; + std::vector left_pads{1, 1}; + + RefKernel ref_kernel; + ref_kernel.Run(reinterpret_cast(in_dev.GetDeviceBuffer()), + reinterpret_cast(wei_dev.GetDeviceBuffer()), + reinterpret_cast(out_dev.GetDeviceBuffer()), + G, + N, + K, + C, + input_spatial, + filter_spatial, + output_spatial, + strides, + dilations, + left_pads); + + // If we get here, Run() worked! + std::cout << "✓ Reference Forward kernel executed!" << std::endl; + EXPECT_TRUE(true); +} + +TEST(ReferenceExecution, BackwardData_2D_FP16) +{ + // Note: When you don't specify .operation, it defaults to PassThrough + // Reference implementation only supports PassThrough elementwise operations + constexpr ConvSignature sig{.spatial_dim = 2, + .direction = ConvDirection::BACKWARD_DATA, + .data_type = DataType::FP16, + .accumulation_data_type = DataType::FP32, + .input = {.config = {.layout = TensorLayout::GNHWC}}, + .weight = {.config = {.layout = TensorLayout::GKYXC}}, + .output = {.config = {.layout = TensorLayout::GNHWK}}}; + + constexpr auto ref_alg = ConvAlgorithm_Reference{}; + using RefKernel = ConvBuilder::Instance; + + const int G = 1, N = 2, C = 4, K = 4, H = 3, W = 3; + + const size_t in_grad_size = G * N * C * H * W * sizeof(ck::half_t); + const size_t wei_size = G * K * C * 3 * 3 * sizeof(ck::half_t); + const size_t out_grad_size = G * N * K * H * W * sizeof(ck::half_t); + + ck::DeviceMem in_grad_dev(in_grad_size); + ck::DeviceMem wei_dev(wei_size); + ck::DeviceMem out_grad_dev(out_grad_size); + + in_grad_dev.SetZero(); + wei_dev.SetZero(); + out_grad_dev.SetZero(); + + std::vector input_spatial{H, W}; + std::vector filter_spatial{3, 3}; + std::vector output_spatial{H, W}; + std::vector strides{1, 1}; + std::vector dilations{1, 1}; + std::vector left_pads{1, 1}; + + RefKernel ref_kernel; + ref_kernel.Run(reinterpret_cast(in_grad_dev.GetDeviceBuffer()), + reinterpret_cast(wei_dev.GetDeviceBuffer()), + reinterpret_cast(out_grad_dev.GetDeviceBuffer()), + G, + N, + K, + C, + input_spatial, + filter_spatial, + output_spatial, + strides, + dilations, + left_pads); + + std::cout << "✓ Reference Backward Data kernel executed!" << std::endl; + EXPECT_TRUE(true); +} + +TEST(ReferenceExecution, BackwardWeight_2D_FP16) +{ + // Note: When you don't specify .operation, it defaults to PassThrough + // Reference implementation only supports PassThrough elementwise operations + constexpr ConvSignature sig{.spatial_dim = 2, + .direction = ConvDirection::BACKWARD_WEIGHT, + .data_type = DataType::FP16, + .accumulation_data_type = DataType::FP32, + .input = {.config = {.layout = TensorLayout::GNHWC}}, + .weight = {.config = {.layout = TensorLayout::GKYXC}}, + .output = {.config = {.layout = TensorLayout::GNHWK}}}; + + constexpr auto ref_alg = ConvAlgorithm_Reference{}; + using RefKernel = ConvBuilder::Instance; + + const int G = 1, N = 2, C = 4, K = 4, H = 3, W = 3; + + const size_t in_size = G * N * C * H * W * sizeof(ck::half_t); + const size_t wei_grad_size = G * K * C * 3 * 3 * sizeof(ck::half_t); + const size_t out_grad_size = G * N * K * H * W * sizeof(ck::half_t); + + ck::DeviceMem in_dev(in_size); + ck::DeviceMem wei_grad_dev(wei_grad_size); + ck::DeviceMem out_grad_dev(out_grad_size); + + in_dev.SetZero(); + wei_grad_dev.SetZero(); + out_grad_dev.SetZero(); + + std::vector input_spatial{H, W}; + std::vector filter_spatial{3, 3}; + std::vector output_spatial{H, W}; + std::vector strides{1, 1}; + std::vector dilations{1, 1}; + std::vector left_pads{1, 1}; + + RefKernel ref_kernel; + ref_kernel.Run(reinterpret_cast(in_dev.GetDeviceBuffer()), + reinterpret_cast(wei_grad_dev.GetDeviceBuffer()), + reinterpret_cast(out_grad_dev.GetDeviceBuffer()), + G, + N, + K, + C, + input_spatial, + filter_spatial, + output_spatial, + strides, + dilations, + left_pads); + + std::cout << "✓ Reference Backward Weight kernel executed!" << std::endl; + EXPECT_TRUE(true); +} + +// Test the old CK interface: MakeArgumentPointer + MakeInvokerPointer +TEST(ReferenceExecution, BackwardData_2D_FP16_InvokerInterface) +{ + constexpr ConvSignature sig{.spatial_dim = 2, + .direction = ConvDirection::BACKWARD_DATA, + .data_type = DataType::FP16, + .accumulation_data_type = DataType::FP32, + .input = {.config = {.layout = TensorLayout::NHWGC}}, + .weight = {.config = {.layout = TensorLayout::GKYXC}}, + .output = {.config = {.layout = TensorLayout::NHWGK}}}; + + constexpr auto ref_alg = ConvAlgorithm_Reference{}; + using RefKernel = ConvBuilder::Instance; + + const int G = 1, N = 2, C = 4, K = 4, H = 3, W = 3; + + const size_t in_grad_size = G * N * C * H * W * sizeof(ck::half_t); + const size_t wei_size = G * K * C * 3 * 3 * sizeof(ck::half_t); + const size_t out_grad_size = G * N * K * H * W * sizeof(ck::half_t); + + ck::DeviceMem in_grad_dev(in_grad_size); + ck::DeviceMem wei_dev(wei_size); + ck::DeviceMem out_grad_dev(out_grad_size); + + in_grad_dev.SetZero(); + wei_dev.SetZero(); + out_grad_dev.SetZero(); + + std::vector input_spatial{H, W}; + std::vector filter_spatial{3, 3}; + std::vector output_spatial{H, W}; + std::vector strides{1, 1}; + std::vector dilations{1, 1}; + std::vector left_pads{1, 1}; + + RefKernel ref_kernel; + + // TEST: Use the old CK interface (MakeArgumentPointer + MakeInvokerPointer) + auto argument_ptr = ref_kernel.MakeArgumentPointer( + reinterpret_cast(in_grad_dev.GetDeviceBuffer()), + reinterpret_cast(wei_dev.GetDeviceBuffer()), + reinterpret_cast(out_grad_dev.GetDeviceBuffer()), + G, + N, + K, + C, + input_spatial, + filter_spatial, + output_spatial, + strides, + dilations, + left_pads); + + auto invoker_ptr = ref_kernel.MakeInvokerPointer(); + + // Run using invoker + float time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + + std::cout << "✓ Reference Backward Data kernel executed via Invoker interface!" << std::endl; + std::cout << " (time = " << time << " ms)" << std::endl; + EXPECT_TRUE(true); +} + +// Test the old CK interface for Forward convolution +TEST(ReferenceExecution, Forward_2D_FP16_InvokerInterface) +{ + constexpr ConvSignature sig{.spatial_dim = 2, + .direction = ConvDirection::FORWARD, + .data_type = DataType::FP16, + .accumulation_data_type = DataType::FP32, + .input = {.config = {.layout = TensorLayout::GNHWC}}, + .weight = {.config = {.layout = TensorLayout::GKYXC}}, + .output = {.config = {.layout = TensorLayout::GNHWK}}}; + + constexpr auto ref_alg = ConvAlgorithm_Reference{}; + using RefKernel = ConvBuilder::Instance; + + const int G = 1, N = 2, C = 4, K = 4, H = 3, W = 3; + + const size_t in_size = G * N * C * H * W * sizeof(ck::half_t); + const size_t wei_size = G * K * C * 3 * 3 * sizeof(ck::half_t); + const size_t out_size = G * N * K * H * W * sizeof(ck::half_t); + + ck::DeviceMem in_dev(in_size); + ck::DeviceMem wei_dev(wei_size); + ck::DeviceMem out_dev(out_size); + + in_dev.SetZero(); + wei_dev.SetZero(); + out_dev.SetZero(); + + std::vector input_spatial{H, W}; + std::vector filter_spatial{3, 3}; + std::vector output_spatial{H, W}; + std::vector strides{1, 1}; + std::vector dilations{1, 1}; + std::vector left_pads{1, 1}; + + RefKernel ref_kernel; + + // TEST: Use the old CK interface (MakeArgumentPointer + MakeInvokerPointer) + auto argument_ptr = ref_kernel.MakeArgumentPointer( + reinterpret_cast(in_dev.GetDeviceBuffer()), + reinterpret_cast(wei_dev.GetDeviceBuffer()), + reinterpret_cast(out_dev.GetDeviceBuffer()), + G, + N, + K, + C, + input_spatial, + filter_spatial, + output_spatial, + strides, + dilations, + left_pads); + + auto invoker_ptr = ref_kernel.MakeInvokerPointer(); + + // Run using invoker + float time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + + std::cout << "✓ Reference Forward kernel executed via Invoker interface!" << std::endl; + std::cout << " (time = " << time << " ms)" << std::endl; + EXPECT_TRUE(true); +} + +// Test the old CK interface for Backward Weight convolution +TEST(ReferenceExecution, BackwardWeight_2D_FP16_InvokerInterface) +{ + constexpr ConvSignature sig{.spatial_dim = 2, + .direction = ConvDirection::BACKWARD_WEIGHT, + .data_type = DataType::FP16, + .accumulation_data_type = DataType::FP32, + .input = {.config = {.layout = TensorLayout::GNHWC}}, + .weight = {.config = {.layout = TensorLayout::GKYXC}}, + .output = {.config = {.layout = TensorLayout::GNHWK}}}; + + constexpr auto ref_alg = ConvAlgorithm_Reference{}; + using RefKernel = ConvBuilder::Instance; + + const int G = 1, N = 2, C = 4, K = 4, H = 3, W = 3; + + const size_t in_size = G * N * C * H * W * sizeof(ck::half_t); + const size_t wei_grad_size = G * K * C * 3 * 3 * sizeof(ck::half_t); + const size_t out_grad_size = G * N * K * H * W * sizeof(ck::half_t); + + ck::DeviceMem in_dev(in_size); + ck::DeviceMem wei_grad_dev(wei_grad_size); + ck::DeviceMem out_grad_dev(out_grad_size); + + in_dev.SetZero(); + wei_grad_dev.SetZero(); + out_grad_dev.SetZero(); + + std::vector input_spatial{H, W}; + std::vector filter_spatial{3, 3}; + std::vector output_spatial{H, W}; + std::vector strides{1, 1}; + std::vector dilations{1, 1}; + std::vector left_pads{1, 1}; + + RefKernel ref_kernel; + + // TEST: Use the old CK interface (MakeArgumentPointer + MakeInvokerPointer) + auto argument_ptr = ref_kernel.MakeArgumentPointer( + reinterpret_cast(in_dev.GetDeviceBuffer()), + reinterpret_cast(wei_grad_dev.GetDeviceBuffer()), + reinterpret_cast(out_grad_dev.GetDeviceBuffer()), + G, + N, + K, + C, + input_spatial, + filter_spatial, + output_spatial, + strides, + dilations, + left_pads); + + auto invoker_ptr = ref_kernel.MakeInvokerPointer(); + + // Run using invoker + float time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + + std::cout << "✓ Reference Backward Weight kernel executed via Invoker interface!" << std::endl; + std::cout << " (time = " << time << " ms)" << std::endl; + EXPECT_TRUE(true); +} + +// Test Builder Reference vs Direct GPU Reference with RANDOM INPUT +TEST(ReferenceExecution, Forward_2D_FP16_Builder_vs_DirectGPUReference_Random) +{ + constexpr ConvSignature sig{.spatial_dim = 2, + .direction = ConvDirection::FORWARD, + .data_type = DataType::FP16, + .accumulation_data_type = DataType::FP32, + .input = {.config = {.layout = TensorLayout::NHWGC}}, + .weight = {.config = {.layout = TensorLayout::GKYXC}}, + .output = {.config = {.layout = TensorLayout::NHWGK}}}; + + constexpr auto ref_alg = ConvAlgorithm_Reference{}; + using RefKernel = ConvBuilder::Instance; + + const int G = 1, N = 2, C = 16, K = 16, H = 14, W = 14; // Small for fast testing + + const size_t in_size = G * N * C * H * W * sizeof(ck::half_t); + const size_t wei_size = G * K * C * 3 * 3 * sizeof(ck::half_t); + const size_t out_size = G * N * K * H * W * sizeof(ck::half_t); + + // Create host buffers with random data + const size_t in_elements = G * N * C * H * W; + const size_t wei_elements = G * K * C * 3 * 3; + const size_t out_elements = G * N * K * H * W; + + std::vector in_host(in_elements); + std::vector wei_host(wei_elements); + + // Fill with random values + std::srand(12345); // Fixed seed for reproducibility + for(size_t i = 0; i < in_elements; i++) + { + in_host[i] = ck::half_t(static_cast(std::rand()) / RAND_MAX * 2.0f - 1.0f); + } + for(size_t i = 0; i < wei_elements; i++) + { + wei_host[i] = ck::half_t(static_cast(std::rand()) / RAND_MAX * 2.0f - 1.0f); + } + + // Allocate GPU memory + ck::DeviceMem in_dev(in_size); + ck::DeviceMem wei_dev(wei_size); + ck::DeviceMem out_builder_dev(out_size); + ck::DeviceMem out_naive_dev(out_size); + + // Transfer random data to GPU + in_dev.ToDevice(in_host.data()); + wei_dev.ToDevice(wei_host.data()); + out_builder_dev.SetZero(); + out_naive_dev.SetZero(); + + std::vector input_spatial{H, W}; + std::vector filter_spatial{3, 3}; + std::vector output_spatial{H, W}; + std::vector strides{1, 1}; + std::vector dilations{1, 1}; + std::vector left_pads{1, 1}; + + RefKernel builder_kernel; + + // Run 1: Builder Reference Factory + builder_kernel.Run(reinterpret_cast(in_dev.GetDeviceBuffer()), + reinterpret_cast(wei_dev.GetDeviceBuffer()), + reinterpret_cast(out_builder_dev.GetDeviceBuffer()), + G, + N, + K, + C, + input_spatial, + filter_spatial, + output_spatial, + strides, + dilations, + left_pads); + + // Run 2: Direct GPU Reference (same kernel the Builder calls internally!) + ck_tile::naive_grouped_conv_fwd<2, ck::half_t, ck::half_t, ck::half_t>( + reinterpret_cast(in_dev.GetDeviceBuffer()), + reinterpret_cast(wei_dev.GetDeviceBuffer()), + reinterpret_cast(out_naive_dev.GetDeviceBuffer()), + G, + N, + K, + C, + input_spatial, + filter_spatial, + output_spatial, + strides, + dilations, + left_pads); + + // Copy results back + std::vector out_builder_result(out_elements); + std::vector out_naive_result(out_elements); + out_builder_dev.FromDevice(out_builder_result.data()); + out_naive_dev.FromDevice(out_naive_result.data()); + + // Compare - should be IDENTICAL (both call same kernel) + bool pass = ck::utils::check_err(out_builder_result, + out_naive_result, + "Error: Builder Reference != Direct GPU Reference", + 1e-6, + 1e-6); // Very tight tolerance! + + std::cout << "✓ Builder Reference vs Direct GPU Reference (RANDOM INPUT)!" << std::endl; + std::cout << " Result: " << (pass ? "IDENTICAL ✓" : "MISMATCH ✗") << std::endl; + std::cout << " This validates Builder Reference Factory is correct!" << std::endl; + + EXPECT_TRUE(pass); +} + +// Test Builder Reference vs Direct GPU Reference with RANDOM INPUT - Backward Data +TEST(ReferenceExecution, BackwardData_2D_FP16_Builder_vs_DirectGPUReference_Random) +{ + constexpr ConvSignature sig{.spatial_dim = 2, + .direction = ConvDirection::BACKWARD_DATA, + .data_type = DataType::FP16, + .accumulation_data_type = DataType::FP32, + .input = {.config = {.layout = TensorLayout::NHWGC}}, + .weight = {.config = {.layout = TensorLayout::GKYXC}}, + .output = {.config = {.layout = TensorLayout::NHWGK}}}; + + constexpr auto ref_alg = ConvAlgorithm_Reference{}; + using RefKernel = ConvBuilder::Instance; + + const int G = 1, N = 2, C = 16, K = 16, H = 14, W = 14; + + const size_t in_grad_size = G * N * C * H * W * sizeof(ck::half_t); + const size_t wei_size = G * K * C * 3 * 3 * sizeof(ck::half_t); + const size_t out_grad_size = G * N * K * H * W * sizeof(ck::half_t); + + const size_t in_grad_elements = G * N * C * H * W; + const size_t wei_elements = G * K * C * 3 * 3; + const size_t out_grad_elements = G * N * K * H * W; + + std::vector wei_host(wei_elements); + std::vector out_grad_host(out_grad_elements); + + // Fill with random values + std::srand(12346); + for(size_t i = 0; i < wei_elements; i++) + { + wei_host[i] = ck::half_t(static_cast(std::rand()) / RAND_MAX * 2.0f - 1.0f); + } + for(size_t i = 0; i < out_grad_elements; i++) + { + out_grad_host[i] = ck::half_t(static_cast(std::rand()) / RAND_MAX * 2.0f - 1.0f); + } + + ck::DeviceMem in_grad_builder_dev(in_grad_size); + ck::DeviceMem in_grad_naive_dev(in_grad_size); + ck::DeviceMem wei_dev(wei_size); + ck::DeviceMem out_grad_dev(out_grad_size); + + wei_dev.ToDevice(wei_host.data()); + out_grad_dev.ToDevice(out_grad_host.data()); + in_grad_builder_dev.SetZero(); + in_grad_naive_dev.SetZero(); + + std::vector input_spatial{H, W}; + std::vector filter_spatial{3, 3}; + std::vector output_spatial{H, W}; + std::vector strides{1, 1}; + std::vector dilations{1, 1}; + std::vector left_pads{1, 1}; + + RefKernel builder_kernel; + + // Run 1: Builder Reference Factory + builder_kernel.Run(reinterpret_cast(in_grad_builder_dev.GetDeviceBuffer()), + reinterpret_cast(wei_dev.GetDeviceBuffer()), + reinterpret_cast(out_grad_dev.GetDeviceBuffer()), + G, + N, + K, + C, + input_spatial, + filter_spatial, + output_spatial, + strides, + dilations, + left_pads); + + // Run 2: Direct GPU Reference + ck_tile::naive_grouped_conv_bwd_data<2, ck::half_t, ck::half_t, ck::half_t>( + reinterpret_cast(in_grad_naive_dev.GetDeviceBuffer()), + reinterpret_cast(wei_dev.GetDeviceBuffer()), + reinterpret_cast(out_grad_dev.GetDeviceBuffer()), + G, + N, + K, + C, + input_spatial, + filter_spatial, + output_spatial, + strides, + dilations, + left_pads); + + // Compare + std::vector in_grad_builder_result(in_grad_elements); + std::vector in_grad_naive_result(in_grad_elements); + in_grad_builder_dev.FromDevice(in_grad_builder_result.data()); + in_grad_naive_dev.FromDevice(in_grad_naive_result.data()); + + bool pass = ck::utils::check_err(in_grad_builder_result, + in_grad_naive_result, + "Error: Builder Backward Data != Direct GPU Reference", + 1e-6, + 1e-6); + + std::cout << "✓ Builder Reference vs Direct GPU Reference (RANDOM INPUT - Backward Data)!" + << std::endl; + std::cout << " Result: " << (pass ? "IDENTICAL ✓" : "MISMATCH ✗") << std::endl; + EXPECT_TRUE(pass); +} + +// Test Builder Reference vs Direct GPU Reference with RANDOM INPUT - Backward Weight +TEST(ReferenceExecution, BackwardWeight_2D_FP16_Builder_vs_DirectGPUReference_Random) +{ + constexpr ConvSignature sig{.spatial_dim = 2, + .direction = ConvDirection::BACKWARD_WEIGHT, + .data_type = DataType::FP16, + .accumulation_data_type = DataType::FP32, + .input = {.config = {.layout = TensorLayout::NHWGC}}, + .weight = {.config = {.layout = TensorLayout::GKYXC}}, + .output = {.config = {.layout = TensorLayout::NHWGK}}}; + + constexpr auto ref_alg = ConvAlgorithm_Reference{}; + using RefKernel = ConvBuilder::Instance; + + const int G = 1, N = 2, C = 16, K = 16, H = 14, W = 14; + + const size_t in_size = G * N * C * H * W * sizeof(ck::half_t); + const size_t wei_grad_size = G * K * C * 3 * 3 * sizeof(ck::half_t); + const size_t out_grad_size = G * N * K * H * W * sizeof(ck::half_t); + + const size_t in_elements = G * N * C * H * W; + const size_t wei_grad_elements = G * K * C * 3 * 3; + const size_t out_grad_elements = G * N * K * H * W; + + std::vector in_host(in_elements); + std::vector out_grad_host(out_grad_elements); + + // Fill with random values + std::srand(12347); + for(size_t i = 0; i < in_elements; i++) + { + in_host[i] = ck::half_t(static_cast(std::rand()) / RAND_MAX * 2.0f - 1.0f); + } + for(size_t i = 0; i < out_grad_elements; i++) + { + out_grad_host[i] = ck::half_t(static_cast(std::rand()) / RAND_MAX * 2.0f - 1.0f); + } + + ck::DeviceMem in_dev(in_size); + ck::DeviceMem wei_grad_builder_dev(wei_grad_size); + ck::DeviceMem wei_grad_naive_dev(wei_grad_size); + ck::DeviceMem out_grad_dev(out_grad_size); + + in_dev.ToDevice(in_host.data()); + out_grad_dev.ToDevice(out_grad_host.data()); + wei_grad_builder_dev.SetZero(); + wei_grad_naive_dev.SetZero(); + + std::vector input_spatial{H, W}; + std::vector filter_spatial{3, 3}; + std::vector output_spatial{H, W}; + std::vector strides{1, 1}; + std::vector dilations{1, 1}; + std::vector left_pads{1, 1}; + + RefKernel builder_kernel; + + // Run 1: Builder Reference Factory + builder_kernel.Run(reinterpret_cast(in_dev.GetDeviceBuffer()), + reinterpret_cast(wei_grad_builder_dev.GetDeviceBuffer()), + reinterpret_cast(out_grad_dev.GetDeviceBuffer()), + G, + N, + K, + C, + input_spatial, + filter_spatial, + output_spatial, + strides, + dilations, + left_pads); + + // Run 2: Direct GPU Reference + ck_tile::naive_grouped_conv_bwd_weight<2, ck::half_t, ck::half_t, ck::half_t>( + reinterpret_cast(in_dev.GetDeviceBuffer()), + reinterpret_cast(wei_grad_naive_dev.GetDeviceBuffer()), + reinterpret_cast(out_grad_dev.GetDeviceBuffer()), + G, + N, + K, + C, + input_spatial, + filter_spatial, + output_spatial, + strides, + dilations, + left_pads); + + // Compare + std::vector wei_grad_builder_result(wei_grad_elements); + std::vector wei_grad_naive_result(wei_grad_elements); + wei_grad_builder_dev.FromDevice(wei_grad_builder_result.data()); + wei_grad_naive_dev.FromDevice(wei_grad_naive_result.data()); + + bool pass = ck::utils::check_err(wei_grad_builder_result, + wei_grad_naive_result, + "Error: Builder Backward Weight != Direct GPU Reference", + 1e-6, + 1e-6); + + std::cout << "✓ Builder Reference vs Direct GPU Reference (RANDOM INPUT - Backward Weight)!" + << std::endl; + std::cout << " Result: " << (pass ? "IDENTICAL ✓" : "MISMATCH ✗") << std::endl; + EXPECT_TRUE(pass); +} + +// Test Invoker Interface vs Direct GPU Reference with RANDOM INPUT - Forward +TEST(ReferenceExecution, Forward_2D_FP16_InvokerInterface_vs_DirectGPUReference_Random) +{ + constexpr ConvSignature sig{.spatial_dim = 2, + .direction = ConvDirection::FORWARD, + .data_type = DataType::FP16, + .accumulation_data_type = DataType::FP32, + .input = {.config = {.layout = TensorLayout::NHWGC}}, + .weight = {.config = {.layout = TensorLayout::GKYXC}}, + .output = {.config = {.layout = TensorLayout::NHWGK}}}; + + constexpr auto ref_alg = ConvAlgorithm_Reference{}; + using RefKernel = ConvBuilder::Instance; + + const int G = 1, N = 2, C = 16, K = 16, H = 14, W = 14; + + const size_t in_size = G * N * C * H * W * sizeof(ck::half_t); + const size_t wei_size = G * K * C * 3 * 3 * sizeof(ck::half_t); + const size_t out_size = G * N * K * H * W * sizeof(ck::half_t); + + const size_t in_elements = G * N * C * H * W; + const size_t wei_elements = G * K * C * 3 * 3; + const size_t out_elements = G * N * K * H * W; + + std::vector in_host(in_elements); + std::vector wei_host(wei_elements); + + std::srand(12348); + for(size_t i = 0; i < in_elements; i++) + { + in_host[i] = ck::half_t(static_cast(std::rand()) / RAND_MAX * 2.0f - 1.0f); + } + for(size_t i = 0; i < wei_elements; i++) + { + wei_host[i] = ck::half_t(static_cast(std::rand()) / RAND_MAX * 2.0f - 1.0f); + } + + ck::DeviceMem in_dev(in_size); + ck::DeviceMem wei_dev(wei_size); + ck::DeviceMem out_invoker_dev(out_size); + ck::DeviceMem out_naive_dev(out_size); + + in_dev.ToDevice(in_host.data()); + wei_dev.ToDevice(wei_host.data()); + out_invoker_dev.SetZero(); + out_naive_dev.SetZero(); + + std::vector input_spatial{H, W}; + std::vector filter_spatial{3, 3}; + std::vector output_spatial{H, W}; + std::vector strides{1, 1}; + std::vector dilations{1, 1}; + std::vector left_pads{1, 1}; + + RefKernel builder_kernel; + + // Run 1: Builder Invoker Interface + auto argument_ptr = builder_kernel.MakeArgumentPointer( + reinterpret_cast(in_dev.GetDeviceBuffer()), + reinterpret_cast(wei_dev.GetDeviceBuffer()), + reinterpret_cast(out_invoker_dev.GetDeviceBuffer()), + G, + N, + K, + C, + input_spatial, + filter_spatial, + output_spatial, + strides, + dilations, + left_pads); + + auto invoker_ptr = builder_kernel.MakeInvokerPointer(); + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + + // Run 2: Direct GPU Reference + ck_tile::naive_grouped_conv_fwd<2, ck::half_t, ck::half_t, ck::half_t>( + reinterpret_cast(in_dev.GetDeviceBuffer()), + reinterpret_cast(wei_dev.GetDeviceBuffer()), + reinterpret_cast(out_naive_dev.GetDeviceBuffer()), + G, + N, + K, + C, + input_spatial, + filter_spatial, + output_spatial, + strides, + dilations, + left_pads); + + // Compare + std::vector out_invoker_result(out_elements); + std::vector out_naive_result(out_elements); + out_invoker_dev.FromDevice(out_invoker_result.data()); + out_naive_dev.FromDevice(out_naive_result.data()); + + bool pass = ck::utils::check_err(out_invoker_result, + out_naive_result, + "Error: Invoker Interface != Direct GPU Reference", + 1e-6, + 1e-6); + + std::cout << "✓ Invoker Interface vs Direct GPU Reference (RANDOM - Forward)!" << std::endl; + std::cout << " Result: " << (pass ? "IDENTICAL ✓" : "MISMATCH ✗") << std::endl; + EXPECT_TRUE(pass); +} + +// Test Invoker Interface vs Direct GPU Reference with RANDOM INPUT - Backward Data +TEST(ReferenceExecution, BackwardData_2D_FP16_InvokerInterface_vs_DirectGPUReference_Random) +{ + constexpr ConvSignature sig{.spatial_dim = 2, + .direction = ConvDirection::BACKWARD_DATA, + .data_type = DataType::FP16, + .accumulation_data_type = DataType::FP32, + .input = {.config = {.layout = TensorLayout::NHWGC}}, + .weight = {.config = {.layout = TensorLayout::GKYXC}}, + .output = {.config = {.layout = TensorLayout::NHWGK}}}; + + constexpr auto ref_alg = ConvAlgorithm_Reference{}; + using RefKernel = ConvBuilder::Instance; + + const int G = 1, N = 2, C = 16, K = 16, H = 14, W = 14; + + const size_t in_grad_size = G * N * C * H * W * sizeof(ck::half_t); + const size_t wei_size = G * K * C * 3 * 3 * sizeof(ck::half_t); + const size_t out_grad_size = G * N * K * H * W * sizeof(ck::half_t); + + const size_t in_grad_elements = G * N * C * H * W; + const size_t wei_elements = G * K * C * 3 * 3; + const size_t out_grad_elements = G * N * K * H * W; + + std::vector wei_host(wei_elements); + std::vector out_grad_host(out_grad_elements); + + std::srand(12349); + for(size_t i = 0; i < wei_elements; i++) + { + wei_host[i] = ck::half_t(static_cast(std::rand()) / RAND_MAX * 2.0f - 1.0f); + } + for(size_t i = 0; i < out_grad_elements; i++) + { + out_grad_host[i] = ck::half_t(static_cast(std::rand()) / RAND_MAX * 2.0f - 1.0f); + } + + ck::DeviceMem in_grad_invoker_dev(in_grad_size); + ck::DeviceMem in_grad_naive_dev(in_grad_size); + ck::DeviceMem wei_dev(wei_size); + ck::DeviceMem out_grad_dev(out_grad_size); + + wei_dev.ToDevice(wei_host.data()); + out_grad_dev.ToDevice(out_grad_host.data()); + in_grad_invoker_dev.SetZero(); + in_grad_naive_dev.SetZero(); + + std::vector input_spatial{H, W}; + std::vector filter_spatial{3, 3}; + std::vector output_spatial{H, W}; + std::vector strides{1, 1}; + std::vector dilations{1, 1}; + std::vector left_pads{1, 1}; + + RefKernel builder_kernel; + + // Run 1: Builder Invoker Interface + auto argument_ptr = builder_kernel.MakeArgumentPointer( + reinterpret_cast(in_grad_invoker_dev.GetDeviceBuffer()), + reinterpret_cast(wei_dev.GetDeviceBuffer()), + reinterpret_cast(out_grad_dev.GetDeviceBuffer()), + G, + N, + K, + C, + input_spatial, + filter_spatial, + output_spatial, + strides, + dilations, + left_pads); + + auto invoker_ptr = builder_kernel.MakeInvokerPointer(); + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + + // Run 2: Direct GPU Reference + ck_tile::naive_grouped_conv_bwd_data<2, ck::half_t, ck::half_t, ck::half_t>( + reinterpret_cast(in_grad_naive_dev.GetDeviceBuffer()), + reinterpret_cast(wei_dev.GetDeviceBuffer()), + reinterpret_cast(out_grad_dev.GetDeviceBuffer()), + G, + N, + K, + C, + input_spatial, + filter_spatial, + output_spatial, + strides, + dilations, + left_pads); + + // Compare + std::vector in_grad_invoker_result(in_grad_elements); + std::vector in_grad_naive_result(in_grad_elements); + in_grad_invoker_dev.FromDevice(in_grad_invoker_result.data()); + in_grad_naive_dev.FromDevice(in_grad_naive_result.data()); + + bool pass = + ck::utils::check_err(in_grad_invoker_result, + in_grad_naive_result, + "Error: Invoker Interface != Direct GPU Reference (Backward Data)", + 1e-6, + 1e-6); + + std::cout << "✓ Invoker Interface vs Direct GPU Reference (RANDOM - Backward Data)!" + << std::endl; + std::cout << " Result: " << (pass ? "IDENTICAL ✓" : "MISMATCH ✗") << std::endl; + EXPECT_TRUE(pass); +} + +// Test Invoker Interface vs Direct GPU Reference with RANDOM INPUT - Backward Weight +TEST(ReferenceExecution, BackwardWeight_2D_FP16_InvokerInterface_vs_DirectGPUReference_Random) +{ + constexpr ConvSignature sig{.spatial_dim = 2, + .direction = ConvDirection::BACKWARD_WEIGHT, + .data_type = DataType::FP16, + .accumulation_data_type = DataType::FP32, + .input = {.config = {.layout = TensorLayout::NHWGC}}, + .weight = {.config = {.layout = TensorLayout::GKYXC}}, + .output = {.config = {.layout = TensorLayout::NHWGK}}}; + + constexpr auto ref_alg = ConvAlgorithm_Reference{}; + using RefKernel = ConvBuilder::Instance; + + const int G = 1, N = 2, C = 16, K = 16, H = 14, W = 14; + + const size_t in_size = G * N * C * H * W * sizeof(ck::half_t); + const size_t wei_grad_size = G * K * C * 3 * 3 * sizeof(ck::half_t); + const size_t out_grad_size = G * N * K * H * W * sizeof(ck::half_t); + + const size_t in_elements = G * N * C * H * W; + const size_t wei_grad_elements = G * K * C * 3 * 3; + const size_t out_grad_elements = G * N * K * H * W; + + std::vector in_host(in_elements); + std::vector out_grad_host(out_grad_elements); + + std::srand(12350); + for(size_t i = 0; i < in_elements; i++) + { + in_host[i] = ck::half_t(static_cast(std::rand()) / RAND_MAX * 2.0f - 1.0f); + } + for(size_t i = 0; i < out_grad_elements; i++) + { + out_grad_host[i] = ck::half_t(static_cast(std::rand()) / RAND_MAX * 2.0f - 1.0f); + } + + ck::DeviceMem in_dev(in_size); + ck::DeviceMem wei_grad_invoker_dev(wei_grad_size); + ck::DeviceMem wei_grad_naive_dev(wei_grad_size); + ck::DeviceMem out_grad_dev(out_grad_size); + + in_dev.ToDevice(in_host.data()); + out_grad_dev.ToDevice(out_grad_host.data()); + wei_grad_invoker_dev.SetZero(); + wei_grad_naive_dev.SetZero(); + + std::vector input_spatial{H, W}; + std::vector filter_spatial{3, 3}; + std::vector output_spatial{H, W}; + std::vector strides{1, 1}; + std::vector dilations{1, 1}; + std::vector left_pads{1, 1}; + + RefKernel builder_kernel; + + // Run 1: Builder Invoker Interface + auto argument_ptr = builder_kernel.MakeArgumentPointer( + reinterpret_cast(in_dev.GetDeviceBuffer()), + reinterpret_cast(wei_grad_invoker_dev.GetDeviceBuffer()), + reinterpret_cast(out_grad_dev.GetDeviceBuffer()), + G, + N, + K, + C, + input_spatial, + filter_spatial, + output_spatial, + strides, + dilations, + left_pads); + + auto invoker_ptr = builder_kernel.MakeInvokerPointer(); + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + + // Run 2: Direct GPU Reference + ck_tile::naive_grouped_conv_bwd_weight<2, ck::half_t, ck::half_t, ck::half_t>( + reinterpret_cast(in_dev.GetDeviceBuffer()), + reinterpret_cast(wei_grad_naive_dev.GetDeviceBuffer()), + reinterpret_cast(out_grad_dev.GetDeviceBuffer()), + G, + N, + K, + C, + input_spatial, + filter_spatial, + output_spatial, + strides, + dilations, + left_pads); + + // Compare + std::vector wei_grad_invoker_result(wei_grad_elements); + std::vector wei_grad_naive_result(wei_grad_elements); + wei_grad_invoker_dev.FromDevice(wei_grad_invoker_result.data()); + wei_grad_naive_dev.FromDevice(wei_grad_naive_result.data()); + + bool pass = + ck::utils::check_err(wei_grad_invoker_result, + wei_grad_naive_result, + "Error: Invoker Interface != Direct GPU Reference (Backward Weight)", + 1e-6, + 1e-6); + + std::cout << "✓ Invoker Interface vs Direct GPU Reference (RANDOM - Backward Weight)!" + << std::endl; + std::cout << " Result: " << (pass ? "IDENTICAL ✓" : "MISMATCH ✗") << std::endl; + EXPECT_TRUE(pass); +} + +} // namespace diff --git a/experimental/builder/test/validation/test_reference_instance_traits.cpp b/experimental/builder/test/validation/test_reference_instance_traits.cpp new file mode 100644 index 0000000000..3e79d51ac7 --- /dev/null +++ b/experimental/builder/test/validation/test_reference_instance_traits.cpp @@ -0,0 +1,117 @@ +// Copyright (c) Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +// Test: Verify InstanceTraits works for Reference kernels + +#include "ck_tile/builder/conv_builder.hpp" +#include "ck_tile/builder/types.hpp" +#include "ck_tile/builder/reflect/instance_traits_reference.hpp" +#include "impl/conv_algorithm_types.hpp" +#include "impl/conv_signature_types.hpp" +#include + +namespace { + +using namespace ck_tile::builder; +using namespace ck_tile::builder::test; + +TEST(ReferenceInstanceTraits, Forward_2D_FP16) +{ + // Create a reference forward kernel + constexpr ConvSignature sig{.spatial_dim = 2, + .direction = ConvDirection::FORWARD, + .data_type = DataType::FP16, + .accumulation_data_type = DataType::FP32, + .input = {.config = {.layout = TensorLayout::NHWGC}}, + .weight = {.config = {.layout = TensorLayout::GKYXC}}, + .output = {.config = {.layout = TensorLayout::NHWGK}}}; + + constexpr auto ref_alg = ConvAlgorithm_Reference{}; + using RefKernel = ConvBuilder::Instance; + + // Use InstanceTraits to query properties + using Traits = ck_tile::reflect::InstanceTraits; + + // Verify spatial dimension + EXPECT_EQ(Traits::kSpatialDim, 2); + + // Verify direction + EXPECT_EQ(Traits::direction, ConvDirection::FORWARD); + + // Verify data types + EXPECT_TRUE((std::is_same_v)); + EXPECT_TRUE((std::is_same_v)); + EXPECT_TRUE((std::is_same_v)); + + // Verify layouts + EXPECT_TRUE((std::is_same_v)); + EXPECT_TRUE((std::is_same_v)); + EXPECT_TRUE((std::is_same_v)); + + // Verify elementwise operations (always PassThrough for reference) + EXPECT_TRUE( + (std::is_same_v)); + EXPECT_TRUE( + (std::is_same_v)); + EXPECT_TRUE( + (std::is_same_v)); + + // Verify block size is 0 (N/A for reference) + EXPECT_EQ(Traits::kBlockSize, 0); + + // Verify instance_string() - now includes data type and layouts! + std::string instance_str = Traits::instance_string(); + EXPECT_EQ(instance_str, "GPU_Reference_Forward_2D_fp16_NHWGC_GKYXC_NHWGK"); + + std::cout << "✓ Forward InstanceTraits validated: " << instance_str << std::endl; +} + +TEST(ReferenceInstanceTraits, BackwardData_2D_FP16) +{ + constexpr ConvSignature sig{.spatial_dim = 2, + .direction = ConvDirection::BACKWARD_DATA, + .data_type = DataType::FP16, + .accumulation_data_type = DataType::FP32, + .input = {.config = {.layout = TensorLayout::NHWGC}}, + .weight = {.config = {.layout = TensorLayout::GKYXC}}, + .output = {.config = {.layout = TensorLayout::NHWGK}}}; + + constexpr auto ref_alg = ConvAlgorithm_Reference{}; + using RefKernel = ConvBuilder::Instance; + + using Traits = ck_tile::reflect::InstanceTraits; + + EXPECT_EQ(Traits::kSpatialDim, 2); + EXPECT_EQ(Traits::direction, ConvDirection::BACKWARD_DATA); + + std::string instance_str = Traits::instance_string(); + EXPECT_EQ(instance_str, "GPU_Reference_BackwardData_2D_fp16_NHWGC_GKYXC_NHWGK"); + + std::cout << "✓ Backward Data InstanceTraits validated: " << instance_str << std::endl; +} + +TEST(ReferenceInstanceTraits, BackwardWeight_2D_FP16) +{ + constexpr ConvSignature sig{.spatial_dim = 2, + .direction = ConvDirection::BACKWARD_WEIGHT, + .data_type = DataType::FP16, + .accumulation_data_type = DataType::FP32, + .input = {.config = {.layout = TensorLayout::NHWGC}}, + .weight = {.config = {.layout = TensorLayout::GKYXC}}, + .output = {.config = {.layout = TensorLayout::NHWGK}}}; + + constexpr auto ref_alg = ConvAlgorithm_Reference{}; + using RefKernel = ConvBuilder::Instance; + + using Traits = ck_tile::reflect::InstanceTraits; + + EXPECT_EQ(Traits::kSpatialDim, 2); + EXPECT_EQ(Traits::direction, ConvDirection::BACKWARD_WEIGHT); + + std::string instance_str = Traits::instance_string(); + EXPECT_EQ(instance_str, "GPU_Reference_BackwardWeight_2D_fp16_NHWGC_GKYXC_NHWGK"); + + std::cout << "✓ Backward Weight InstanceTraits validated: " << instance_str << std::endl; +} + +} // namespace