[CK_BUILDER] Replace reference conv with old ck implementation (#3604)

* ck-builder: remove SPATIAL_DIM parameter from ConvTensorLayouts

This information is already in the SIGNATURE, so it's pointless to pass it
separately. This streamlines the interface of those functions a bit. Also
touches up the style of those files in general.

* ck-builder: implement reference conv using old ck

The old ck implementation is more featureful and better tested.

* ck-builder: replace test_reference_execution reference with old ck

This strips out the ck-tile gpu reference implementation completely.

* ck-builder: clean up test_reference_execution

- Remove unnecessary messages
- Replace EXPECT_TRUE(true) with EXPECT_NO_THROW()
This commit is contained in:
Robin Voetter
2026-01-21 19:18:47 +01:00
committed by GitHub
parent 0fbb3bb8c4
commit 1040d9b1f5
24 changed files with 291 additions and 1067 deletions

View File

@@ -23,7 +23,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
struct ConvBwdWeightDlFactory
{
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
using Layouts = internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
using Layouts = internal::ConvTensorLayouts<SIGNATURE>;
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
using Ops = internal::ConvElementwiseOps<SIGNATURE>;
using AlgorithmType = decltype(ALGORITHM);

View File

@@ -26,7 +26,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
struct ConvBwdWeightMultiDWmmaV3Factory
{
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
using Layouts = internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
using Layouts = internal::ConvTensorLayouts<SIGNATURE>;
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
using Ops = internal::ConvElementwiseOps<SIGNATURE>;
using AlgorithmType = decltype(ALGORITHM);

View File

@@ -26,7 +26,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
struct ConvBwdWeightMultiDXdlFactory
{
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
using Layouts = internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
using Layouts = internal::ConvTensorLayouts<SIGNATURE>;
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
using Ops = internal::ConvElementwiseOps<SIGNATURE>;
using AlgorithmType = decltype(ALGORITHM);

View File

@@ -26,7 +26,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
struct ConvBwdWeightTwoStageWmmaV3Factory
{
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
using Layouts = internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
using Layouts = internal::ConvTensorLayouts<SIGNATURE>;
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
using Ops = internal::ConvElementwiseOps<SIGNATURE>;
using AlgorithmType = decltype(ALGORITHM);

View File

@@ -26,7 +26,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
struct ConvBwdWeightTwoStageXdlFactory
{
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
using Layouts = internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
using Layouts = internal::ConvTensorLayouts<SIGNATURE>;
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
using Ops = internal::ConvElementwiseOps<SIGNATURE>;
using AlgorithmType = decltype(ALGORITHM);

View File

@@ -26,7 +26,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
struct ConvBwdWeightWmmaFactory
{
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
using Layouts = internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
using Layouts = internal::ConvTensorLayouts<SIGNATURE>;
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
using Ops = internal::ConvElementwiseOps<SIGNATURE>;
using AlgorithmType = decltype(ALGORITHM);

View File

@@ -26,7 +26,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
struct ConvBwdWeightWmmaV3Factory
{
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
using Layouts = internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
using Layouts = internal::ConvTensorLayouts<SIGNATURE>;
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
using Ops = internal::ConvElementwiseOps<SIGNATURE>;
using AlgorithmType = decltype(ALGORITHM);

View File

@@ -26,7 +26,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
struct ConvBwdWeightXdlFactory
{
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
using Layouts = internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
using Layouts = internal::ConvTensorLayouts<SIGNATURE>;
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
using Ops = internal::ConvElementwiseOps<SIGNATURE>;
using AlgorithmType = decltype(ALGORITHM);

View File

@@ -26,7 +26,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
struct ConvBwdWeightXdlV3Factory
{
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
using Layouts = internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
using Layouts = internal::ConvTensorLayouts<SIGNATURE>;
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
using Ops = internal::ConvElementwiseOps<SIGNATURE>;
using AlgorithmType = decltype(ALGORITHM);

View File

@@ -24,7 +24,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
struct ConvFwdDlFactory
{
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
using Layouts = internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
using Layouts = internal::ConvTensorLayouts<SIGNATURE>;
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
using Ops = internal::ConvElementwiseOps<SIGNATURE>;
using AlgorithmType = decltype(ALGORITHM);

View File

@@ -26,7 +26,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
struct ConvFwdLargeTensorFactory
{
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
using Layouts = internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
using Layouts = internal::ConvTensorLayouts<SIGNATURE>;
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
using Ops = internal::ConvElementwiseOps<SIGNATURE>;
using AlgorithmType = decltype(ALGORITHM);

View File

@@ -26,7 +26,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
struct ConvFwdXdlV3Factory
{
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
using Layouts = internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
using Layouts = internal::ConvTensorLayouts<SIGNATURE>;
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
using Ops = internal::ConvElementwiseOps<SIGNATURE>;
using AlgorithmType = decltype(ALGORITHM);

View File

@@ -26,7 +26,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
struct ConvFwdWmmaFactory
{
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
using Layouts = internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
using Layouts = internal::ConvTensorLayouts<SIGNATURE>;
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
using Ops = internal::ConvElementwiseOps<SIGNATURE>;
using AlgorithmType = decltype(ALGORITHM);

View File

@@ -26,7 +26,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
struct ConvFwdXdlFactory
{
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
using Layouts = internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
using Layouts = internal::ConvTensorLayouts<SIGNATURE>;
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
using Ops = internal::ConvElementwiseOps<SIGNATURE>;
using AlgorithmType = decltype(ALGORITHM);

View File

@@ -29,7 +29,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
struct ConvTileFactory
{
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
using Layouts = internal::TileConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
using Layouts = internal::TileConvTensorLayouts<SIGNATURE>;
using Types = internal::TileConvTensorTypes<SIGNATURE.data_type>;
using Ops = internal::TileElementwiseOps<SIGNATURE>;
using AlgorithmType = decltype(ALGORITHM);

View File

@@ -172,10 +172,10 @@ struct LayoutToCK<TensorLayout::GNDHWK>
using type = ck::tensor_layout::convolution::GNDHWK;
};
template <TensorLayout Layout>
template <TensorLayout LAYOUT>
consteval auto TensorLayoutToCK()
{
return typename LayoutToCK<Layout>::type{};
return typename LayoutToCK<LAYOUT>::type{};
}
struct EmptyAuxiliaryTensorLayout
@@ -183,49 +183,52 @@ struct EmptyAuxiliaryTensorLayout
using type = ck::Tuple<>;
};
template <auto AuxiliaryTensorConfigsArray, size_t... Indices>
template <auto AUXILIARY_TENSOR_CONFIGS_ARRAY, size_t... Indices>
consteval auto GetAuxiliaryTensorLayoutTuple(std::index_sequence<Indices...>)
{
return ck::Tuple<
decltype(TensorLayoutToCK<AuxiliaryTensorConfigsArray[Indices].layout>())...>{};
decltype(TensorLayoutToCK<AUXILIARY_TENSOR_CONFIGS_ARRAY[Indices].layout>())...>{};
}
template <auto AuxiliaryTensorConfigsValue, size_t SPATIAL_DIM>
template <auto AUXILIARY_TENSOR_CONFIGS_VALUE, size_t SPATIAL_DIM>
requires(ConvSpatialDim<SPATIAL_DIM>)
struct AuxiliaryTensorLayouts
{
static constexpr auto Size = AuxiliaryTensorConfigsValue.size();
using type = decltype(GetAuxiliaryTensorLayoutTuple<AuxiliaryTensorConfigsValue>(
static constexpr auto Size = AUXILIARY_TENSOR_CONFIGS_VALUE.size();
using type = decltype(GetAuxiliaryTensorLayoutTuple<AUXILIARY_TENSOR_CONFIGS_VALUE>(
std::make_index_sequence<Size>{}));
};
// TODO: Currently only the ouput tensor can have auxiliary tensors (e.g., bias).
template <auto Signature, size_t SPATIAL_DIM>
requires(HasElementwiseOpWithAuxiliaryOperands<decltype(Signature.output)>)
template <auto SIGNATURE>
requires HasElementwiseOpWithAuxiliaryOperands<decltype(SIGNATURE.output)>
consteval auto GetAuxiliaryTensorLayouts()
{
return AuxiliaryTensorLayouts<Signature.output.operation.auxiliary_operand_configs,
SPATIAL_DIM>{};
return AuxiliaryTensorLayouts<SIGNATURE.output.operation.auxiliary_operand_configs,
SIGNATURE.spatial_dim>{};
}
template <auto Signature, size_t SPATIAL_DIM>
requires(!HasElementwiseOpWithAuxiliaryOperands<decltype(Signature.output)>)
template <auto SIGNATURE>
requires(!HasElementwiseOpWithAuxiliaryOperands<decltype(SIGNATURE.output)>)
consteval auto GetAuxiliaryTensorLayouts()
{
return EmptyAuxiliaryTensorLayout{};
}
template <auto Signature, size_t SPATIAL_DIM>
requires(ConvSpatialDim<SPATIAL_DIM> &&
ValidConvInputLayoutForSpatialDim<Signature.input.config.layout, SPATIAL_DIM> &&
ValidConvWeightLayoutForSpatialDim<Signature.weight.config.layout, SPATIAL_DIM> &&
ValidConvOutputLayoutForSpatialDim<Signature.output.config.layout, SPATIAL_DIM>)
template <auto SIGNATURE>
requires ConvSpatialDim<SIGNATURE.spatial_dim> &&
ValidConvInputLayoutForSpatialDim<SIGNATURE.input.config.layout,
SIGNATURE.spatial_dim> &&
ValidConvWeightLayoutForSpatialDim<SIGNATURE.weight.config.layout,
SIGNATURE.spatial_dim> &&
ValidConvOutputLayoutForSpatialDim<SIGNATURE.output.config.layout,
SIGNATURE.spatial_dim>
struct ConvTensorLayouts
{
using InLayout = decltype(TensorLayoutToCK<Signature.input.config.layout>());
using WeiLayout = decltype(TensorLayoutToCK<Signature.weight.config.layout>());
using OutLayout = decltype(TensorLayoutToCK<Signature.output.config.layout>());
using DsLayout = decltype(GetAuxiliaryTensorLayouts<Signature, SPATIAL_DIM>())::type;
using InLayout = decltype(TensorLayoutToCK<SIGNATURE.input.config.layout>());
using WeiLayout = decltype(TensorLayoutToCK<SIGNATURE.weight.config.layout>());
using OutLayout = decltype(TensorLayoutToCK<SIGNATURE.output.config.layout>());
using DsLayout = decltype(GetAuxiliaryTensorLayouts<SIGNATURE>())::type;
};
} // namespace ck_tile::builder::factory::internal

View File

@@ -9,10 +9,10 @@
namespace ck_tile::builder::factory::internal {
using ALayout = ck_tile::tensor_layout::convolution::NWGC;
template <TensorLayout Layout>
template <TensorLayout LAYOUT>
struct LayoutToCKTile
{
static_assert(sizeof(UnsupportedEnumValue<Layout>) == 0,
static_assert(sizeof(UnsupportedEnumValue<LAYOUT>) == 0,
"Unsupported layout conversion to CK.");
};
@@ -152,49 +152,52 @@ struct EmptyAuxiliaryTileTensorLayout
using type = ck_tile::tuple<>;
};
template <auto AuxiliaryTileTensorConfigsArray, size_t... Indices>
template <auto AUXILIARY_TILE_TENSOR_CONFIGS_ARRAY, size_t... Indices>
consteval auto GetAuxiliaryTileTensorLayoutTuple(std::index_sequence<Indices...>)
{
return ck_tile::tuple<
decltype(TensorLayoutToCKTile<AuxiliaryTileTensorConfigsArray[Indices].layout>())...>{};
decltype(TensorLayoutToCKTile<AUXILIARY_TILE_TENSOR_CONFIGS_ARRAY[Indices].layout>())...>{};
}
template <auto AuxiliaryTileTensorConfigsValue, size_t SPATIAL_DIM>
requires(ConvSpatialDim<SPATIAL_DIM>)
template <auto AUXILIARY_TILE_TENSOR_CONFIGS_VALUE, size_t SPATIAL_DIM>
requires ConvSpatialDim<SPATIAL_DIM>
struct AuxiliaryTileTensorLayouts
{
static constexpr auto Size = AuxiliaryTileTensorConfigsValue.size();
using type = decltype(GetAuxiliaryTileTensorLayoutTuple<AuxiliaryTileTensorConfigsValue>(
static constexpr auto Size = AUXILIARY_TILE_TENSOR_CONFIGS_VALUE.size();
using type = decltype(GetAuxiliaryTileTensorLayoutTuple<AUXILIARY_TILE_TENSOR_CONFIGS_VALUE>(
std::make_index_sequence<Size>{}));
};
// TODO: Currently only the ouput tensor can have auxiliary tensors (e.g., bias).
template <auto Signature, size_t SPATIAL_DIM>
requires(HasElementwiseOpWithAuxiliaryOperands<decltype(Signature.output)>)
template <auto SIGNATURE>
requires HasElementwiseOpWithAuxiliaryOperands<decltype(SIGNATURE.output)>
consteval auto GetAuxiliaryTileTensorLayouts()
{
return AuxiliaryTileTensorLayouts<Signature.output.operation.auxiliary_operand_configs,
SPATIAL_DIM>{};
return AuxiliaryTileTensorLayouts<SIGNATURE.output.operation.auxiliary_operand_configs,
SIGNATURE.spatial_dim>{};
}
template <auto Signature, size_t SPATIAL_DIM>
requires(!HasElementwiseOpWithAuxiliaryOperands<decltype(Signature.output)>)
template <auto SIGNATURE>
requires(!HasElementwiseOpWithAuxiliaryOperands<decltype(SIGNATURE.output)>)
consteval auto GetAuxiliaryTileTensorLayouts()
{
return EmptyAuxiliaryTileTensorLayout{};
}
template <auto Signature, size_t SPATIAL_DIM>
requires(ConvSpatialDim<SPATIAL_DIM> &&
ValidConvInputLayoutForSpatialDim<Signature.input.config.layout, SPATIAL_DIM> &&
ValidConvWeightLayoutForSpatialDim<Signature.weight.config.layout, SPATIAL_DIM> &&
ValidConvOutputLayoutForSpatialDim<Signature.output.config.layout, SPATIAL_DIM>)
template <auto SIGNATURE>
requires ConvSpatialDim<SIGNATURE.spatial_dim> &&
ValidConvInputLayoutForSpatialDim<SIGNATURE.input.config.layout,
SIGNATURE.spatial_dim> &&
ValidConvWeightLayoutForSpatialDim<SIGNATURE.weight.config.layout,
SIGNATURE.spatial_dim> &&
ValidConvOutputLayoutForSpatialDim<SIGNATURE.output.config.layout,
SIGNATURE.spatial_dim>
struct TileConvTensorLayouts
{
using ALayout = decltype(TensorLayoutToCKTile<Signature.input.config.layout>());
using BLayout = decltype(TensorLayoutToCKTile<Signature.weight.config.layout>());
using ELayout = decltype(TensorLayoutToCKTile<Signature.output.config.layout>());
using DsLayout = decltype(GetAuxiliaryTileTensorLayouts<Signature, SPATIAL_DIM>())::type;
using ALayout = decltype(TensorLayoutToCKTile<SIGNATURE.input.config.layout>());
using BLayout = decltype(TensorLayoutToCKTile<SIGNATURE.weight.config.layout>());
using ELayout = decltype(TensorLayoutToCKTile<SIGNATURE.output.config.layout>());
using DsLayout = decltype(GetAuxiliaryTileTensorLayouts<SIGNATURE>())::type;
};
} // namespace ck_tile::builder::factory::internal

View File

@@ -1,118 +0,0 @@
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
// SPDX-License-Identifier: MIT
#pragma once
#include "ck_tile/core.hpp"
#include "ck_tile/builder/conv_signature_concepts.hpp"
#include "ck_tile/builder/types.hpp"
#include <vector>
namespace ck_tile::builder::factory::internal {
// Validation helper: Ensure reference implementation only receives PassThrough elementwise ops
template <auto SIGNATURE>
consteval void ValidateReferenceSignature()
{
using namespace ck_tile::builder;
// Check input elementwise operation
static_assert(
!HasTensorOp<decltype(SIGNATURE.input)> ||
SIGNATURE.input.operation.elementwise_operation == ElementwiseOperation::PASS_THROUGH,
"Reference implementation does not support elementwise operations on input tensor. "
"Input operation must be PassThrough (or not specified).");
// Check weight elementwise operation
static_assert(
!HasTensorOp<decltype(SIGNATURE.weight)> ||
SIGNATURE.weight.operation.elementwise_operation == ElementwiseOperation::PASS_THROUGH,
"Reference implementation does not support elementwise operations on weight tensor. "
"Weight operation must be PassThrough (or not specified).");
// Check output elementwise operation
static_assert(
!HasTensorOp<decltype(SIGNATURE.output)> ||
SIGNATURE.output.operation.elementwise_operation == ElementwiseOperation::PASS_THROUGH,
"Reference implementation does not support elementwise operations on output tensor. "
"Output operation must be PassThrough (or not specified).");
}
// Common argument structure for reference convolution implementations
// Template parameters allow different const qualifiers for each direction
template <typename InPtrType, typename WeiPtrType, typename OutPtrType>
struct ReferenceConvArgument
{
InPtrType input_;
WeiPtrType weight_;
OutPtrType output_;
int G_, N_, K_, C_;
std::vector<ck_tile::long_index_t> input_spatial_;
std::vector<ck_tile::long_index_t> filter_spatial_;
std::vector<ck_tile::long_index_t> output_spatial_;
std::vector<ck_tile::long_index_t> strides_;
std::vector<ck_tile::long_index_t> dilations_;
std::vector<ck_tile::long_index_t> left_pads_;
ReferenceConvArgument(InPtrType input,
WeiPtrType weight,
OutPtrType output,
int G,
int N,
int K,
int C,
const std::vector<ck_tile::long_index_t>& input_spatial,
const std::vector<ck_tile::long_index_t>& filter_spatial,
const std::vector<ck_tile::long_index_t>& output_spatial,
const std::vector<ck_tile::long_index_t>& strides,
const std::vector<ck_tile::long_index_t>& dilations,
const std::vector<ck_tile::long_index_t>& left_pads)
: input_(input),
weight_(weight),
output_(output),
G_(G),
N_(N),
K_(K),
C_(C),
input_spatial_(input_spatial),
filter_spatial_(filter_spatial),
output_spatial_(output_spatial),
strides_(strides),
dilations_(dilations),
left_pads_(left_pads)
{
}
};
// Common invoker structure for reference convolution implementations
// Takes a callable (lambda or function pointer) to execute the actual convolution
template <typename ArgumentType, typename ConvFunc>
struct ReferenceConvInvoker
{
ConvFunc conv_func_;
explicit ReferenceConvInvoker(ConvFunc func) : conv_func_(func) {}
float Run(const ArgumentType* arg, const StreamConfig& stream_config = StreamConfig{})
{
(void)stream_config; // Unused for reference implementation
conv_func_(arg->input_,
arg->weight_,
arg->output_,
arg->G_,
arg->N_,
arg->K_,
arg->C_,
arg->input_spatial_,
arg->filter_spatial_,
arg->output_spatial_,
arg->strides_,
arg->dilations_,
arg->left_pads_);
return 0.0f; // Reference implementation doesn't track timing
}
};
} // namespace ck_tile::builder::factory::internal

View File

@@ -3,15 +3,15 @@
#pragma once
#include "ck_tile/ref/naive_grouped_conv_fwd_gpu.hpp"
#include "ck_tile/ref/naive_grouped_conv_bwd_data_gpu.hpp"
#include "ck_tile/ref/naive_grouped_conv_bwd_weight_gpu.hpp"
#include "ck_tile/builder/conv_signature_concepts.hpp"
#include "ck_tile/builder/conv_algorithm_concepts.hpp"
#include "ck_tile/builder/types.hpp"
#include "ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp"
#include "ck_tile/builder/factory/reference_common.hpp"
#include "ck_tile/core.hpp"
#include "ck_tile/builder/factory/helpers/ck/conv_tensor_layout.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp"
#include "ck/library/utility/convolution_parameter.hpp"
#include <memory>
namespace ck_tile::builder::factory {
@@ -22,16 +22,23 @@ template <ConvSignatureDescriptor auto SIGNATURE,
StringLiteral VERSION>
struct ReferenceFactory
{
// Validate that only PassThrough elementwise operations are specified
static constexpr auto kValidation = (internal::ValidateReferenceSignature<SIGNATURE>(), 0);
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
using InDataType = typename Types::InDataType;
using WeiDataType = typename Types::WeiDataType;
using OutDataType = typename Types::OutDataType;
using Layouts = factory::internal::ConvTensorLayouts<SIGNATURE>;
using InLayout = typename Layouts::InLayout;
using WeiLayout = typename Layouts::WeiLayout;
using OutLayout = typename Layouts::OutLayout;
using Ops = factory::internal::ConvElementwiseOps<SIGNATURE>;
using InElementwiseOp = typename Ops::InElementwiseOp;
using WeiElementwiseOp = typename Ops::WeiElementwiseOp;
using OutElementwiseOp = typename Ops::OutElementwiseOp;
struct Instance
{
// Store template parameters for InstanceTraits reflection
@@ -39,91 +46,57 @@ struct ReferenceFactory
static constexpr auto kAlgorithm = ALGORITHM;
static constexpr auto kVersion = VERSION;
// Argument and Invoker types depend on direction
// Forward: const input, const weight, mutable output
// Backward Data: mutable input, const weight, const output_grad
// Backward Weight: const input, mutable weight_grad, const output_grad
// Use appropriate Argument type based on direction
using Argument = std::conditional_t<
ConvDirectionIsForward<SIGNATURE>,
internal::ReferenceConvArgument<const InDataType*, const WeiDataType*, OutDataType*>,
std::conditional_t<
ConvDirectionIsBackwardData<SIGNATURE>,
internal::
ReferenceConvArgument<InDataType*, const WeiDataType*, const OutDataType*>,
internal::
ReferenceConvArgument<const InDataType*, WeiDataType*, const OutDataType*>>>;
// Invoker calls the appropriate reference implementation based on direction
struct Invoker
/// @brief Invoke reference convolution
///
/// This is the primary overload to invoke reference convolution. As the underlying
/// function requires it, this function accepts ConvParam directly.
template <typename InPtrType, typename WeiPtrType, typename OutPtrType>
static void Run(InPtrType* input,
WeiPtrType* weight,
OutPtrType* output,
const ck::utils::conv::ConvParam& param,
InElementwiseOp in_op = InElementwiseOp{},
WeiElementwiseOp wei_op = WeiElementwiseOp{},
OutElementwiseOp out_op = OutElementwiseOp{})
{
float Run(const Argument* arg, const StreamConfig& stream_config = StreamConfig{})
if constexpr(ConvDirectionIsForward<SIGNATURE>)
{
(void)stream_config; // Unused for reference implementation
if constexpr(ConvDirectionIsForward<SIGNATURE>)
{
ck_tile::
naive_grouped_conv_fwd<SPATIAL_DIM, InDataType, WeiDataType, OutDataType>(
arg->input_,
arg->weight_,
arg->output_,
arg->G_,
arg->N_,
arg->K_,
arg->C_,
arg->input_spatial_,
arg->filter_spatial_,
arg->output_spatial_,
arg->strides_,
arg->dilations_,
arg->left_pads_);
}
else if constexpr(ConvDirectionIsBackwardData<SIGNATURE>)
{
ck_tile::naive_grouped_conv_bwd_data<SPATIAL_DIM,
InDataType,
WeiDataType,
OutDataType>(arg->input_,
arg->weight_,
arg->output_,
arg->G_,
arg->N_,
arg->K_,
arg->C_,
arg->input_spatial_,
arg->filter_spatial_,
arg->output_spatial_,
arg->strides_,
arg->dilations_,
arg->left_pads_);
}
else if constexpr(ConvDirectionIsBackwardWeight<SIGNATURE>)
{
ck_tile::naive_grouped_conv_bwd_weight<SPATIAL_DIM,
InDataType,
WeiDataType,
OutDataType>(arg->input_,
arg->weight_,
arg->output_,
arg->G_,
arg->N_,
arg->K_,
arg->C_,
arg->input_spatial_,
arg->filter_spatial_,
arg->output_spatial_,
arg->strides_,
arg->dilations_,
arg->left_pads_);
}
return 0.0f; // Reference implementation doesn't track timing
ck::ref::naive_conv_fwd<InLayout, WeiLayout, OutLayout>(
static_cast<const InDataType*>(input),
static_cast<const WeiDataType*>(weight),
static_cast<OutDataType*>(output),
param,
in_op,
wei_op,
out_op);
}
};
else if constexpr(ConvDirectionIsBackwardData<SIGNATURE>)
{
ck::ref::naive_conv_bwd_data<InLayout, WeiLayout, OutLayout>(
static_cast<InDataType*>(input),
static_cast<const WeiDataType*>(weight),
static_cast<const OutDataType*>(output),
param,
in_op,
wei_op,
out_op);
}
else if constexpr(ConvDirectionIsBackwardWeight<SIGNATURE>)
{
ck::ref::naive_conv_bwd_weight<InLayout, WeiLayout, OutLayout>(
static_cast<const InDataType*>(input),
static_cast<WeiDataType*>(weight),
static_cast<const OutDataType*>(output),
param,
in_op,
wei_op,
out_op);
}
}
// Direct Run method (simpler interface, direction-agnostic)
/// @brief Invoke reference convolution
///
/// Convenience overload to avoid having to construct ConvParam manually.
template <typename InPtrType, typename WeiPtrType, typename OutPtrType>
static void Run(InPtrType* input,
WeiPtrType* weight,
@@ -132,68 +105,27 @@ struct ReferenceFactory
int N,
int K,
int C,
const std::vector<ck_tile::long_index_t>& input_spatial,
const std::vector<ck_tile::long_index_t>& filter_spatial,
const std::vector<ck_tile::long_index_t>& output_spatial,
const std::vector<ck_tile::long_index_t>& strides,
const std::vector<ck_tile::long_index_t>& dilations,
const std::vector<ck_tile::long_index_t>& left_pads)
const std::vector<ck::long_index_t>& input_spatial,
const std::vector<ck::long_index_t>& filter_spatial,
const std::vector<ck::long_index_t>& strides,
const std::vector<ck::long_index_t>& dilations,
const std::vector<ck::long_index_t>& left_pads,
const std::vector<ck::long_index_t>& right_pads)
{
if constexpr(ConvDirectionIsForward<SIGNATURE>)
{
ck_tile::naive_grouped_conv_fwd<SPATIAL_DIM, InDataType, WeiDataType, OutDataType>(
static_cast<const InDataType*>(input),
static_cast<const WeiDataType*>(weight),
static_cast<OutDataType*>(output),
G,
N,
K,
C,
input_spatial,
filter_spatial,
output_spatial,
strides,
dilations,
left_pads);
}
else if constexpr(ConvDirectionIsBackwardData<SIGNATURE>)
{
ck_tile::
naive_grouped_conv_bwd_data<SPATIAL_DIM, InDataType, WeiDataType, OutDataType>(
static_cast<InDataType*>(input),
static_cast<const WeiDataType*>(weight),
static_cast<const OutDataType*>(output),
G,
N,
K,
C,
input_spatial,
filter_spatial,
output_spatial,
strides,
dilations,
left_pads);
}
else if constexpr(ConvDirectionIsBackwardWeight<SIGNATURE>)
{
ck_tile::naive_grouped_conv_bwd_weight<SPATIAL_DIM,
InDataType,
WeiDataType,
OutDataType>(
static_cast<const InDataType*>(input),
static_cast<WeiDataType*>(weight),
static_cast<const OutDataType*>(output),
G,
N,
K,
C,
input_spatial,
filter_spatial,
output_spatial,
strides,
dilations,
left_pads);
}
Run(input,
weight,
output,
ck::utils::conv::ConvParam(SPATIAL_DIM,
G,
N,
K,
C,
filter_spatial,
input_spatial,
strides,
dilations,
left_pads,
right_pads));
}
std::string GetTypeString() const
@@ -209,41 +141,6 @@ struct ReferenceFactory
return std::string("GPU_Reference_") + dir_str + "_" + std::to_string(SPATIAL_DIM) +
"D";
}
// Old CK interface: Create argument pointer
template <typename InPtrType, typename WeiPtrType, typename OutPtrType>
std::unique_ptr<Argument>
MakeArgumentPointer(InPtrType input,
WeiPtrType weight,
OutPtrType output,
int G,
int N,
int K,
int C,
const std::vector<ck_tile::long_index_t>& input_spatial,
const std::vector<ck_tile::long_index_t>& filter_spatial,
const std::vector<ck_tile::long_index_t>& output_spatial,
const std::vector<ck_tile::long_index_t>& strides,
const std::vector<ck_tile::long_index_t>& dilations,
const std::vector<ck_tile::long_index_t>& left_pads) const
{
return std::make_unique<Argument>(input,
weight,
output,
G,
N,
K,
C,
input_spatial,
filter_spatial,
output_spatial,
strides,
dilations,
left_pads);
}
// Old CK interface: Create invoker pointer
std::unique_ptr<Invoker> MakeInvokerPointer() const { return std::make_unique<Invoker>(); }
};
};

View File

@@ -76,7 +76,7 @@ struct Args<SIGNATURE>
using Ops = factory::internal::ConvElementwiseOps<SIGNATURE>;
// TODO: We shouldn't need to call into an internal namespace here.
using Layouts = factory::internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
using Layouts = factory::internal::ConvTensorLayouts<SIGNATURE>;
ConvTensorLengths<SPATIAL_DIM> lengths;

View File

@@ -32,27 +32,8 @@ concept RefConvInstance = requires(Conv& conv,
const void* input,
const void* weight,
void* output,
int G,
int N,
int K,
int C,
std::vector<long_index_t> dims) {
{
conv.Run(input,
weight,
output,
G,
N,
K,
C,
dims, // input_spatial
dims, // filter_spatial
dims, // output_spatial
dims, // strides
dims, // dilations
dims // left_pads
)
};
ck::utils::conv::ConvParam param) {
{ conv.Run(input, weight, output, param) };
};
/// @brief `run()` specialization for forward convolution and the reference
@@ -84,16 +65,6 @@ std::tuple<bool, float> run(RefConvInstance<SIGNATURE> auto& conv,
// Just throw for now, but regard these as TODO items that should be resolved
// eventually.
// Right pads are not supported right now for some reason.
for(auto right_pad : param.input_right_pads_)
{
if(right_pad != 0)
{
std::cout << "TODO: Support right pad in reference conv" << std::endl;
return std::make_tuple(false, 0.0f);
}
}
if(!args.make_input_descriptor().is_packed())
{
std::cout << "TODO: Support non-packed input tensor in reference conv" << std::endl;
@@ -110,19 +81,7 @@ std::tuple<bool, float> run(RefConvInstance<SIGNATURE> auto& conv,
return std::make_tuple(false, 0.0f);
}
conv.Run(inputs.input,
inputs.weight,
outputs.output,
param.G_,
param.N_,
param.K_,
param.C_,
param.input_spatial_lengths_,
param.filter_spatial_lengths_,
param.output_spatial_lengths_,
param.conv_filter_strides_,
param.conv_filter_dilations_,
param.input_left_pads_);
conv.Run(inputs.input, inputs.weight, outputs.output, param);
return std::make_tuple(true, 0.0f);
}

View File

@@ -38,7 +38,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor1D_NWGC_GKXC_NWGK)
.weight = {.config = {.layout = GKXC}},
.output = {.config = {.layout = NWGK}}};
using TensorLayouts = ConvTensorLayouts<sig, 1>;
using TensorLayouts = ConvTensorLayouts<sig>;
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::NWGC>));
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKXC>));
@@ -57,7 +57,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor1D_NGCW_GKXC_NGKW)
.weight = {.config = {.layout = GKXC}},
.output = {.config = {.layout = NGKW}}};
using TensorLayouts = ConvTensorLayouts<sig, 1>;
using TensorLayouts = ConvTensorLayouts<sig>;
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::NGCW>));
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKXC>));
@@ -76,7 +76,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor1D_GNWC_GKXC_GNWK)
.weight = {.config = {.layout = GKXC}},
.output = {.config = {.layout = GNWK}}};
using TensorLayouts = ConvTensorLayouts<sig, 1>;
using TensorLayouts = ConvTensorLayouts<sig>;
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::GNWC>));
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKXC>));
@@ -95,7 +95,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor1D_NGCW_GKCX_NGKW)
.weight = {.config = {.layout = GKCX}},
.output = {.config = {.layout = NGKW}}};
using TensorLayouts = ConvTensorLayouts<sig, 1>;
using TensorLayouts = ConvTensorLayouts<sig>;
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::NGCW>));
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKCX>));
@@ -114,7 +114,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor2D_NGCHW_GKYXC_NGKHW)
.weight = {.config = {.layout = GKYXC}},
.output = {.config = {.layout = NGKHW}}};
using TensorLayouts = ConvTensorLayouts<sig, 2>;
using TensorLayouts = ConvTensorLayouts<sig>;
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::NGCHW>));
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKYXC>));
@@ -133,7 +133,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor2D_NHWGC_GKYXC_NHWGK)
.weight = {.config = {.layout = GKYXC}},
.output = {.config = {.layout = NHWGK}}};
using TensorLayouts = ConvTensorLayouts<sig, 2>;
using TensorLayouts = ConvTensorLayouts<sig>;
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::NHWGC>));
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKYXC>));
@@ -152,7 +152,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor2D_GNHWC_GKYXC_GNHWK)
.weight = {.config = {.layout = GKYXC}},
.output = {.config = {.layout = GNHWK}}};
using TensorLayouts = ConvTensorLayouts<sig, 2>;
using TensorLayouts = ConvTensorLayouts<sig>;
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::GNHWC>));
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKYXC>));
@@ -171,7 +171,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor2D_NGCHW_GKCYX_NGKHW)
.weight = {.config = {.layout = GKCYX}},
.output = {.config = {.layout = NGKHW}}};
using TensorLayouts = ConvTensorLayouts<sig, 2>;
using TensorLayouts = ConvTensorLayouts<sig>;
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::NGCHW>));
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKCYX>));
@@ -190,7 +190,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor3D_NGCDHW_GKCZYX_NGKDHW)
.weight = {.config = {.layout = GKCZYX}},
.output = {.config = {.layout = NGKDHW}}};
using TensorLayouts = ConvTensorLayouts<sig, 3>;
using TensorLayouts = ConvTensorLayouts<sig>;
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::NGCDHW>));
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKCZYX>));
@@ -209,7 +209,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor3D_NDHWGC_GKZYXC_NDHWGK)
.weight = {.config = {.layout = GKZYXC}},
.output = {.config = {.layout = NDHWGK}}};
using TensorLayouts = ConvTensorLayouts<sig, 3>;
using TensorLayouts = ConvTensorLayouts<sig>;
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::NDHWGC>));
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKZYXC>));
@@ -228,7 +228,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor3D_GNDHWC_GKZYXC_GNDHWK)
.weight = {.config = {.layout = GKZYXC}},
.output = {.config = {.layout = GNDHWK}}};
using TensorLayouts = ConvTensorLayouts<sig, 3>;
using TensorLayouts = ConvTensorLayouts<sig>;
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::GNDHWC>));
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKZYXC>));
@@ -387,7 +387,7 @@ TEST(ConvTensorLayoutsWithAuxiliary, Conv2DWithSingleBiasG_K)
.operation =
OutputOp{.elementwise_operation = ElementwiseOperation::SCALE}}};
using TensorLayouts = ConvTensorLayouts<sig, 2>;
using TensorLayouts = ConvTensorLayouts<sig>;
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::NGCHW>));
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKYXC>));
@@ -414,7 +414,7 @@ TEST(ConvTensorLayoutsWithAuxiliary, Conv2DWithSingleBiasGC)
.operation =
OutputOp{.elementwise_operation = ElementwiseOperation::SCALE}}};
using TensorLayouts = ConvTensorLayouts<sig, 2>;
using TensorLayouts = ConvTensorLayouts<sig>;
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::NHWGC>));
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKYXC>));
@@ -442,7 +442,7 @@ TEST(ConvTensorLayoutsWithAuxiliary, Conv2DWithTwoAuxiliaryTensors)
.operation = OutputOp{.elementwise_operation =
ElementwiseOperation::SCALEADD_SCALEADD_RELU}}};
using TensorLayouts = ConvTensorLayouts<sig, 2>;
using TensorLayouts = ConvTensorLayouts<sig>;
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::GNHWC>));
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKYXC>));
@@ -470,7 +470,7 @@ TEST(ConvTensorLayoutsWithAuxiliary, Conv1DWithBias)
.operation =
OutputOp{.elementwise_operation = ElementwiseOperation::SCALE}}};
using TensorLayouts = ConvTensorLayouts<sig, 1>;
using TensorLayouts = ConvTensorLayouts<sig>;
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::NWGC>));
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKXC>));
@@ -497,7 +497,7 @@ TEST(ConvTensorLayoutsWithAuxiliary, Conv3DWithBias)
.operation = OutputOp{.elementwise_operation =
ElementwiseOperation::BIAS_BNORM_CLAMP}}};
using TensorLayouts = ConvTensorLayouts<sig, 3>;
using TensorLayouts = ConvTensorLayouts<sig>;
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::NDHWGC>));
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKZYXC>));

View File

@@ -4,10 +4,10 @@
#include "ck_tile/builder/conv_builder.hpp"
#include "ck_tile/builder/types.hpp"
#include "impl/conv_algorithm_types.hpp"
#include "ck_tile/ref/naive_grouped_conv_fwd_gpu.hpp"
#include "ck_tile/ref/naive_grouped_conv_bwd_data_gpu.hpp"
#include "ck_tile/ref/naive_grouped_conv_bwd_weight_gpu.hpp"
#include "utils/ckb_conv_test_configs.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp"
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp"
#include "ck/library/utility/device_memory.hpp"
#include "ck/library/utility/check_err.hpp"
#include <gtest/gtest.h>
@@ -53,29 +53,25 @@ TEST(ReferenceExecution, Forward_2D_FP16)
// Prepare parameters for Run()
std::vector<ck_tile::long_index_t> input_spatial{H, W};
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
std::vector<ck_tile::long_index_t> output_spatial{H, W};
std::vector<ck_tile::long_index_t> strides{1, 1};
std::vector<ck_tile::long_index_t> dilations{1, 1};
std::vector<ck_tile::long_index_t> left_pads{1, 1};
std::vector<ck_tile::long_index_t> right_pads{1, 1};
RefKernel ref_kernel;
ref_kernel.Run(reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
reinterpret_cast<ck::half_t*>(out_dev.GetDeviceBuffer()),
G,
N,
K,
C,
input_spatial,
filter_spatial,
output_spatial,
strides,
dilations,
left_pads);
// If we get here, Run() worked!
std::cout << "✓ Reference Forward kernel executed!" << std::endl;
EXPECT_TRUE(true);
EXPECT_NO_THROW(ref_kernel.Run(reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
reinterpret_cast<ck::half_t*>(out_dev.GetDeviceBuffer()),
G,
N,
K,
C,
input_spatial,
filter_spatial,
strides,
dilations,
left_pads,
right_pads));
}
TEST(ReferenceExecution, BackwardData_2D_FP16)
@@ -109,28 +105,26 @@ TEST(ReferenceExecution, BackwardData_2D_FP16)
std::vector<ck_tile::long_index_t> input_spatial{H, W};
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
std::vector<ck_tile::long_index_t> output_spatial{H, W};
std::vector<ck_tile::long_index_t> strides{1, 1};
std::vector<ck_tile::long_index_t> dilations{1, 1};
std::vector<ck_tile::long_index_t> left_pads{1, 1};
std::vector<ck_tile::long_index_t> right_pads{1, 1};
RefKernel ref_kernel;
ref_kernel.Run(reinterpret_cast<ck::half_t*>(in_grad_dev.GetDeviceBuffer()),
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
G,
N,
K,
C,
input_spatial,
filter_spatial,
output_spatial,
strides,
dilations,
left_pads);
std::cout << "✓ Reference Backward Data kernel executed!" << std::endl;
EXPECT_TRUE(true);
EXPECT_NO_THROW(
ref_kernel.Run(reinterpret_cast<ck::half_t*>(in_grad_dev.GetDeviceBuffer()),
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
G,
N,
K,
C,
input_spatial,
filter_spatial,
strides,
dilations,
left_pads,
right_pads));
}
TEST(ReferenceExecution, BackwardWeight_2D_FP16)
@@ -164,217 +158,26 @@ TEST(ReferenceExecution, BackwardWeight_2D_FP16)
std::vector<ck_tile::long_index_t> input_spatial{H, W};
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
std::vector<ck_tile::long_index_t> output_spatial{H, W};
std::vector<ck_tile::long_index_t> strides{1, 1};
std::vector<ck_tile::long_index_t> dilations{1, 1};
std::vector<ck_tile::long_index_t> left_pads{1, 1};
std::vector<ck_tile::long_index_t> right_pads{1, 1};
RefKernel ref_kernel;
ref_kernel.Run(reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
reinterpret_cast<ck::half_t*>(wei_grad_dev.GetDeviceBuffer()),
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
G,
N,
K,
C,
input_spatial,
filter_spatial,
output_spatial,
strides,
dilations,
left_pads);
std::cout << "✓ Reference Backward Weight kernel executed!" << std::endl;
EXPECT_TRUE(true);
}
// Test the old CK interface: MakeArgumentPointer + MakeInvokerPointer
TEST(ReferenceExecution, BackwardData_2D_FP16_InvokerInterface)
{
constexpr ConvSignature sig{.spatial_dim = 2,
.direction = ConvDirection::BACKWARD_DATA,
.data_type = DataType::FP16,
.accumulation_data_type = DataType::FP32,
.input = {.config = {.layout = TensorLayout::NHWGC}},
.weight = {.config = {.layout = TensorLayout::GKYXC}},
.output = {.config = {.layout = TensorLayout::NHWGK}}};
constexpr auto ref_alg = ConvAlgorithm_Reference{};
using RefKernel = ConvBuilder<sig, ref_alg>::Instance;
const int G = 1, N = 2, C = 4, K = 4, H = 3, W = 3;
const size_t in_grad_size = G * N * C * H * W * sizeof(ck::half_t);
const size_t wei_size = G * K * C * 3 * 3 * sizeof(ck::half_t);
const size_t out_grad_size = G * N * K * H * W * sizeof(ck::half_t);
ck::DeviceMem in_grad_dev(in_grad_size);
ck::DeviceMem wei_dev(wei_size);
ck::DeviceMem out_grad_dev(out_grad_size);
in_grad_dev.SetZero();
wei_dev.SetZero();
out_grad_dev.SetZero();
std::vector<ck_tile::long_index_t> input_spatial{H, W};
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
std::vector<ck_tile::long_index_t> output_spatial{H, W};
std::vector<ck_tile::long_index_t> strides{1, 1};
std::vector<ck_tile::long_index_t> dilations{1, 1};
std::vector<ck_tile::long_index_t> left_pads{1, 1};
RefKernel ref_kernel;
// TEST: Use the old CK interface (MakeArgumentPointer + MakeInvokerPointer)
auto argument_ptr = ref_kernel.MakeArgumentPointer(
reinterpret_cast<ck::half_t*>(in_grad_dev.GetDeviceBuffer()),
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
G,
N,
K,
C,
input_spatial,
filter_spatial,
output_spatial,
strides,
dilations,
left_pads);
auto invoker_ptr = ref_kernel.MakeInvokerPointer();
// Run using invoker
float time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
std::cout << "✓ Reference Backward Data kernel executed via Invoker interface!" << std::endl;
std::cout << " (time = " << time << " ms)" << std::endl;
EXPECT_TRUE(true);
}
// Test the old CK interface for Forward convolution
TEST(ReferenceExecution, Forward_2D_FP16_InvokerInterface)
{
constexpr ConvSignature sig{.spatial_dim = 2,
.direction = ConvDirection::FORWARD,
.data_type = DataType::FP16,
.accumulation_data_type = DataType::FP32,
.input = {.config = {.layout = TensorLayout::GNHWC}},
.weight = {.config = {.layout = TensorLayout::GKYXC}},
.output = {.config = {.layout = TensorLayout::GNHWK}}};
constexpr auto ref_alg = ConvAlgorithm_Reference{};
using RefKernel = ConvBuilder<sig, ref_alg>::Instance;
const int G = 1, N = 2, C = 4, K = 4, H = 3, W = 3;
const size_t in_size = G * N * C * H * W * sizeof(ck::half_t);
const size_t wei_size = G * K * C * 3 * 3 * sizeof(ck::half_t);
const size_t out_size = G * N * K * H * W * sizeof(ck::half_t);
ck::DeviceMem in_dev(in_size);
ck::DeviceMem wei_dev(wei_size);
ck::DeviceMem out_dev(out_size);
in_dev.SetZero();
wei_dev.SetZero();
out_dev.SetZero();
std::vector<ck_tile::long_index_t> input_spatial{H, W};
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
std::vector<ck_tile::long_index_t> output_spatial{H, W};
std::vector<ck_tile::long_index_t> strides{1, 1};
std::vector<ck_tile::long_index_t> dilations{1, 1};
std::vector<ck_tile::long_index_t> left_pads{1, 1};
RefKernel ref_kernel;
// TEST: Use the old CK interface (MakeArgumentPointer + MakeInvokerPointer)
auto argument_ptr = ref_kernel.MakeArgumentPointer(
reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
reinterpret_cast<ck::half_t*>(out_dev.GetDeviceBuffer()),
G,
N,
K,
C,
input_spatial,
filter_spatial,
output_spatial,
strides,
dilations,
left_pads);
auto invoker_ptr = ref_kernel.MakeInvokerPointer();
// Run using invoker
float time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
std::cout << "✓ Reference Forward kernel executed via Invoker interface!" << std::endl;
std::cout << " (time = " << time << " ms)" << std::endl;
EXPECT_TRUE(true);
}
// Test the old CK interface for Backward Weight convolution
TEST(ReferenceExecution, BackwardWeight_2D_FP16_InvokerInterface)
{
constexpr ConvSignature sig{.spatial_dim = 2,
.direction = ConvDirection::BACKWARD_WEIGHT,
.data_type = DataType::FP16,
.accumulation_data_type = DataType::FP32,
.input = {.config = {.layout = TensorLayout::GNHWC}},
.weight = {.config = {.layout = TensorLayout::GKYXC}},
.output = {.config = {.layout = TensorLayout::GNHWK}}};
constexpr auto ref_alg = ConvAlgorithm_Reference{};
using RefKernel = ConvBuilder<sig, ref_alg>::Instance;
const int G = 1, N = 2, C = 4, K = 4, H = 3, W = 3;
const size_t in_size = G * N * C * H * W * sizeof(ck::half_t);
const size_t wei_grad_size = G * K * C * 3 * 3 * sizeof(ck::half_t);
const size_t out_grad_size = G * N * K * H * W * sizeof(ck::half_t);
ck::DeviceMem in_dev(in_size);
ck::DeviceMem wei_grad_dev(wei_grad_size);
ck::DeviceMem out_grad_dev(out_grad_size);
in_dev.SetZero();
wei_grad_dev.SetZero();
out_grad_dev.SetZero();
std::vector<ck_tile::long_index_t> input_spatial{H, W};
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
std::vector<ck_tile::long_index_t> output_spatial{H, W};
std::vector<ck_tile::long_index_t> strides{1, 1};
std::vector<ck_tile::long_index_t> dilations{1, 1};
std::vector<ck_tile::long_index_t> left_pads{1, 1};
RefKernel ref_kernel;
// TEST: Use the old CK interface (MakeArgumentPointer + MakeInvokerPointer)
auto argument_ptr = ref_kernel.MakeArgumentPointer(
reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
reinterpret_cast<ck::half_t*>(wei_grad_dev.GetDeviceBuffer()),
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
G,
N,
K,
C,
input_spatial,
filter_spatial,
output_spatial,
strides,
dilations,
left_pads);
auto invoker_ptr = ref_kernel.MakeInvokerPointer();
// Run using invoker
float time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
std::cout << "✓ Reference Backward Weight kernel executed via Invoker interface!" << std::endl;
std::cout << " (time = " << time << " ms)" << std::endl;
EXPECT_TRUE(true);
EXPECT_NO_THROW(
ref_kernel.Run(reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
reinterpret_cast<ck::half_t*>(wei_grad_dev.GetDeviceBuffer()),
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
G,
N,
K,
C,
input_spatial,
filter_spatial,
strides,
dilations,
left_pads,
right_pads));
}
// Test Builder Reference vs Direct GPU Reference with RANDOM INPUT
@@ -430,10 +233,10 @@ TEST(ReferenceExecution, Forward_2D_FP16_Builder_vs_DirectGPUReference_Random)
std::vector<ck_tile::long_index_t> input_spatial{H, W};
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
std::vector<ck_tile::long_index_t> output_spatial{H, W};
std::vector<ck_tile::long_index_t> strides{1, 1};
std::vector<ck_tile::long_index_t> dilations{1, 1};
std::vector<ck_tile::long_index_t> left_pads{1, 1};
std::vector<ck_tile::long_index_t> right_pads{1, 1};
RefKernel builder_kernel;
@@ -447,26 +250,35 @@ TEST(ReferenceExecution, Forward_2D_FP16_Builder_vs_DirectGPUReference_Random)
C,
input_spatial,
filter_spatial,
output_spatial,
strides,
dilations,
left_pads);
left_pads,
right_pads);
// Run 2: Direct GPU Reference (same kernel the Builder calls internally!)
ck_tile::naive_grouped_conv_fwd<2, ck::half_t, ck::half_t, ck::half_t>(
ck::ref::naive_conv_fwd<ck::tensor_layout::convolution::NHWGC,
ck::tensor_layout::convolution::GKYXC,
ck::tensor_layout::convolution::NHWGK,
ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough>(
reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
reinterpret_cast<ck::half_t*>(out_naive_dev.GetDeviceBuffer()),
G,
N,
K,
C,
input_spatial,
filter_spatial,
output_spatial,
strides,
dilations,
left_pads);
ck::utils::conv::ConvParam(2,
G,
N,
K,
C,
filter_spatial,
input_spatial,
strides,
dilations,
left_pads,
right_pads));
// Copy results back
std::vector<ck::half_t> out_builder_result(out_elements);
@@ -475,17 +287,11 @@ TEST(ReferenceExecution, Forward_2D_FP16_Builder_vs_DirectGPUReference_Random)
out_naive_dev.FromDevice(out_naive_result.data());
// Compare - should be IDENTICAL (both call same kernel)
bool pass = ck::utils::check_err(out_builder_result,
EXPECT_TRUE(ck::utils::check_err(out_builder_result,
out_naive_result,
"Error: Builder Reference != Direct GPU Reference",
1e-6,
1e-6); // Very tight tolerance!
std::cout << "✓ Builder Reference vs Direct GPU Reference (RANDOM INPUT)!" << std::endl;
std::cout << " Result: " << (pass ? "IDENTICAL ✓" : "MISMATCH ✗") << std::endl;
std::cout << " This validates Builder Reference Factory is correct!" << std::endl;
EXPECT_TRUE(pass);
1e-6)); // Very tight tolerance!
}
// Test Builder Reference vs Direct GPU Reference with RANDOM INPUT - Backward Data
@@ -538,10 +344,10 @@ TEST(ReferenceExecution, BackwardData_2D_FP16_Builder_vs_DirectGPUReference_Rand
std::vector<ck_tile::long_index_t> input_spatial{H, W};
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
std::vector<ck_tile::long_index_t> output_spatial{H, W};
std::vector<ck_tile::long_index_t> strides{1, 1};
std::vector<ck_tile::long_index_t> dilations{1, 1};
std::vector<ck_tile::long_index_t> left_pads{1, 1};
std::vector<ck_tile::long_index_t> right_pads{1, 1};
RefKernel builder_kernel;
@@ -555,26 +361,35 @@ TEST(ReferenceExecution, BackwardData_2D_FP16_Builder_vs_DirectGPUReference_Rand
C,
input_spatial,
filter_spatial,
output_spatial,
strides,
dilations,
left_pads);
left_pads,
right_pads);
// Run 2: Direct GPU Reference
ck_tile::naive_grouped_conv_bwd_data<2, ck::half_t, ck::half_t, ck::half_t>(
ck::ref::naive_conv_bwd_data<ck::tensor_layout::convolution::NHWGC,
ck::tensor_layout::convolution::GKYXC,
ck::tensor_layout::convolution::NHWGK,
ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough>(
reinterpret_cast<ck::half_t*>(in_grad_naive_dev.GetDeviceBuffer()),
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
G,
N,
K,
C,
input_spatial,
filter_spatial,
output_spatial,
strides,
dilations,
left_pads);
ck::utils::conv::ConvParam(2,
G,
N,
K,
C,
filter_spatial,
input_spatial,
strides,
dilations,
left_pads,
right_pads));
// Compare
std::vector<ck::half_t> in_grad_builder_result(in_grad_elements);
@@ -582,16 +397,11 @@ TEST(ReferenceExecution, BackwardData_2D_FP16_Builder_vs_DirectGPUReference_Rand
in_grad_builder_dev.FromDevice(in_grad_builder_result.data());
in_grad_naive_dev.FromDevice(in_grad_naive_result.data());
bool pass = ck::utils::check_err(in_grad_builder_result,
EXPECT_TRUE(ck::utils::check_err(in_grad_builder_result,
in_grad_naive_result,
"Error: Builder Backward Data != Direct GPU Reference",
1e-6,
1e-6);
std::cout << "✓ Builder Reference vs Direct GPU Reference (RANDOM INPUT - Backward Data)!"
<< std::endl;
std::cout << " Result: " << (pass ? "IDENTICAL ✓" : "MISMATCH ✗") << std::endl;
EXPECT_TRUE(pass);
1e-6));
}
// Test Builder Reference vs Direct GPU Reference with RANDOM INPUT - Backward Weight
@@ -644,10 +454,10 @@ TEST(ReferenceExecution, BackwardWeight_2D_FP16_Builder_vs_DirectGPUReference_Ra
std::vector<ck_tile::long_index_t> input_spatial{H, W};
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
std::vector<ck_tile::long_index_t> output_spatial{H, W};
std::vector<ck_tile::long_index_t> strides{1, 1};
std::vector<ck_tile::long_index_t> dilations{1, 1};
std::vector<ck_tile::long_index_t> left_pads{1, 1};
std::vector<ck_tile::long_index_t> right_pads{1, 1};
RefKernel builder_kernel;
@@ -661,26 +471,35 @@ TEST(ReferenceExecution, BackwardWeight_2D_FP16_Builder_vs_DirectGPUReference_Ra
C,
input_spatial,
filter_spatial,
output_spatial,
strides,
dilations,
left_pads);
left_pads,
right_pads);
// Run 2: Direct GPU Reference
ck_tile::naive_grouped_conv_bwd_weight<2, ck::half_t, ck::half_t, ck::half_t>(
ck::ref::naive_conv_bwd_weight<ck::tensor_layout::convolution::NHWGC,
ck::tensor_layout::convolution::GKYXC,
ck::tensor_layout::convolution::NHWGK,
ck::half_t,
ck::half_t,
ck::half_t,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough>(
reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
reinterpret_cast<ck::half_t*>(wei_grad_naive_dev.GetDeviceBuffer()),
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
G,
N,
K,
C,
input_spatial,
filter_spatial,
output_spatial,
strides,
dilations,
left_pads);
ck::utils::conv::ConvParam(2,
G,
N,
K,
C,
filter_spatial,
input_spatial,
strides,
dilations,
left_pads,
right_pads));
// Compare
std::vector<ck::half_t> wei_grad_builder_result(wei_grad_elements);
@@ -688,344 +507,11 @@ TEST(ReferenceExecution, BackwardWeight_2D_FP16_Builder_vs_DirectGPUReference_Ra
wei_grad_builder_dev.FromDevice(wei_grad_builder_result.data());
wei_grad_naive_dev.FromDevice(wei_grad_naive_result.data());
bool pass = ck::utils::check_err(wei_grad_builder_result,
EXPECT_TRUE(ck::utils::check_err(wei_grad_builder_result,
wei_grad_naive_result,
"Error: Builder Backward Weight != Direct GPU Reference",
1e-6,
1e-6);
std::cout << "✓ Builder Reference vs Direct GPU Reference (RANDOM INPUT - Backward Weight)!"
<< std::endl;
std::cout << " Result: " << (pass ? "IDENTICAL ✓" : "MISMATCH ✗") << std::endl;
EXPECT_TRUE(pass);
}
// Test Invoker Interface vs Direct GPU Reference with RANDOM INPUT - Forward
TEST(ReferenceExecution, Forward_2D_FP16_InvokerInterface_vs_DirectGPUReference_Random)
{
constexpr ConvSignature sig{.spatial_dim = 2,
.direction = ConvDirection::FORWARD,
.data_type = DataType::FP16,
.accumulation_data_type = DataType::FP32,
.input = {.config = {.layout = TensorLayout::NHWGC}},
.weight = {.config = {.layout = TensorLayout::GKYXC}},
.output = {.config = {.layout = TensorLayout::NHWGK}}};
constexpr auto ref_alg = ConvAlgorithm_Reference{};
using RefKernel = ConvBuilder<sig, ref_alg>::Instance;
const int G = 1, N = 2, C = 16, K = 16, H = 14, W = 14;
const size_t in_size = G * N * C * H * W * sizeof(ck::half_t);
const size_t wei_size = G * K * C * 3 * 3 * sizeof(ck::half_t);
const size_t out_size = G * N * K * H * W * sizeof(ck::half_t);
const size_t in_elements = G * N * C * H * W;
const size_t wei_elements = G * K * C * 3 * 3;
const size_t out_elements = G * N * K * H * W;
std::vector<ck::half_t> in_host(in_elements);
std::vector<ck::half_t> wei_host(wei_elements);
std::srand(12348);
for(size_t i = 0; i < in_elements; i++)
{
in_host[i] = ck::half_t(static_cast<float>(std::rand()) / RAND_MAX * 2.0f - 1.0f);
}
for(size_t i = 0; i < wei_elements; i++)
{
wei_host[i] = ck::half_t(static_cast<float>(std::rand()) / RAND_MAX * 2.0f - 1.0f);
}
ck::DeviceMem in_dev(in_size);
ck::DeviceMem wei_dev(wei_size);
ck::DeviceMem out_invoker_dev(out_size);
ck::DeviceMem out_naive_dev(out_size);
in_dev.ToDevice(in_host.data());
wei_dev.ToDevice(wei_host.data());
out_invoker_dev.SetZero();
out_naive_dev.SetZero();
std::vector<ck_tile::long_index_t> input_spatial{H, W};
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
std::vector<ck_tile::long_index_t> output_spatial{H, W};
std::vector<ck_tile::long_index_t> strides{1, 1};
std::vector<ck_tile::long_index_t> dilations{1, 1};
std::vector<ck_tile::long_index_t> left_pads{1, 1};
RefKernel builder_kernel;
// Run 1: Builder Invoker Interface
auto argument_ptr = builder_kernel.MakeArgumentPointer(
reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
reinterpret_cast<ck::half_t*>(out_invoker_dev.GetDeviceBuffer()),
G,
N,
K,
C,
input_spatial,
filter_spatial,
output_spatial,
strides,
dilations,
left_pads);
auto invoker_ptr = builder_kernel.MakeInvokerPointer();
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
// Run 2: Direct GPU Reference
ck_tile::naive_grouped_conv_fwd<2, ck::half_t, ck::half_t, ck::half_t>(
reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
reinterpret_cast<ck::half_t*>(out_naive_dev.GetDeviceBuffer()),
G,
N,
K,
C,
input_spatial,
filter_spatial,
output_spatial,
strides,
dilations,
left_pads);
// Compare
std::vector<ck::half_t> out_invoker_result(out_elements);
std::vector<ck::half_t> out_naive_result(out_elements);
out_invoker_dev.FromDevice(out_invoker_result.data());
out_naive_dev.FromDevice(out_naive_result.data());
bool pass = ck::utils::check_err(out_invoker_result,
out_naive_result,
"Error: Invoker Interface != Direct GPU Reference",
1e-6,
1e-6);
std::cout << "✓ Invoker Interface vs Direct GPU Reference (RANDOM - Forward)!" << std::endl;
std::cout << " Result: " << (pass ? "IDENTICAL ✓" : "MISMATCH ✗") << std::endl;
EXPECT_TRUE(pass);
}
// Test Invoker Interface vs Direct GPU Reference with RANDOM INPUT - Backward Data
TEST(ReferenceExecution, BackwardData_2D_FP16_InvokerInterface_vs_DirectGPUReference_Random)
{
constexpr ConvSignature sig{.spatial_dim = 2,
.direction = ConvDirection::BACKWARD_DATA,
.data_type = DataType::FP16,
.accumulation_data_type = DataType::FP32,
.input = {.config = {.layout = TensorLayout::NHWGC}},
.weight = {.config = {.layout = TensorLayout::GKYXC}},
.output = {.config = {.layout = TensorLayout::NHWGK}}};
constexpr auto ref_alg = ConvAlgorithm_Reference{};
using RefKernel = ConvBuilder<sig, ref_alg>::Instance;
const int G = 1, N = 2, C = 16, K = 16, H = 14, W = 14;
const size_t in_grad_size = G * N * C * H * W * sizeof(ck::half_t);
const size_t wei_size = G * K * C * 3 * 3 * sizeof(ck::half_t);
const size_t out_grad_size = G * N * K * H * W * sizeof(ck::half_t);
const size_t in_grad_elements = G * N * C * H * W;
const size_t wei_elements = G * K * C * 3 * 3;
const size_t out_grad_elements = G * N * K * H * W;
std::vector<ck::half_t> wei_host(wei_elements);
std::vector<ck::half_t> out_grad_host(out_grad_elements);
std::srand(12349);
for(size_t i = 0; i < wei_elements; i++)
{
wei_host[i] = ck::half_t(static_cast<float>(std::rand()) / RAND_MAX * 2.0f - 1.0f);
}
for(size_t i = 0; i < out_grad_elements; i++)
{
out_grad_host[i] = ck::half_t(static_cast<float>(std::rand()) / RAND_MAX * 2.0f - 1.0f);
}
ck::DeviceMem in_grad_invoker_dev(in_grad_size);
ck::DeviceMem in_grad_naive_dev(in_grad_size);
ck::DeviceMem wei_dev(wei_size);
ck::DeviceMem out_grad_dev(out_grad_size);
wei_dev.ToDevice(wei_host.data());
out_grad_dev.ToDevice(out_grad_host.data());
in_grad_invoker_dev.SetZero();
in_grad_naive_dev.SetZero();
std::vector<ck_tile::long_index_t> input_spatial{H, W};
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
std::vector<ck_tile::long_index_t> output_spatial{H, W};
std::vector<ck_tile::long_index_t> strides{1, 1};
std::vector<ck_tile::long_index_t> dilations{1, 1};
std::vector<ck_tile::long_index_t> left_pads{1, 1};
RefKernel builder_kernel;
// Run 1: Builder Invoker Interface
auto argument_ptr = builder_kernel.MakeArgumentPointer(
reinterpret_cast<ck::half_t*>(in_grad_invoker_dev.GetDeviceBuffer()),
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
G,
N,
K,
C,
input_spatial,
filter_spatial,
output_spatial,
strides,
dilations,
left_pads);
auto invoker_ptr = builder_kernel.MakeInvokerPointer();
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
// Run 2: Direct GPU Reference
ck_tile::naive_grouped_conv_bwd_data<2, ck::half_t, ck::half_t, ck::half_t>(
reinterpret_cast<ck::half_t*>(in_grad_naive_dev.GetDeviceBuffer()),
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
G,
N,
K,
C,
input_spatial,
filter_spatial,
output_spatial,
strides,
dilations,
left_pads);
// Compare
std::vector<ck::half_t> in_grad_invoker_result(in_grad_elements);
std::vector<ck::half_t> in_grad_naive_result(in_grad_elements);
in_grad_invoker_dev.FromDevice(in_grad_invoker_result.data());
in_grad_naive_dev.FromDevice(in_grad_naive_result.data());
bool pass =
ck::utils::check_err(in_grad_invoker_result,
in_grad_naive_result,
"Error: Invoker Interface != Direct GPU Reference (Backward Data)",
1e-6,
1e-6);
std::cout << "✓ Invoker Interface vs Direct GPU Reference (RANDOM - Backward Data)!"
<< std::endl;
std::cout << " Result: " << (pass ? "IDENTICAL ✓" : "MISMATCH ✗") << std::endl;
EXPECT_TRUE(pass);
}
// Test Invoker Interface vs Direct GPU Reference with RANDOM INPUT - Backward Weight
TEST(ReferenceExecution, BackwardWeight_2D_FP16_InvokerInterface_vs_DirectGPUReference_Random)
{
constexpr ConvSignature sig{.spatial_dim = 2,
.direction = ConvDirection::BACKWARD_WEIGHT,
.data_type = DataType::FP16,
.accumulation_data_type = DataType::FP32,
.input = {.config = {.layout = TensorLayout::NHWGC}},
.weight = {.config = {.layout = TensorLayout::GKYXC}},
.output = {.config = {.layout = TensorLayout::NHWGK}}};
constexpr auto ref_alg = ConvAlgorithm_Reference{};
using RefKernel = ConvBuilder<sig, ref_alg>::Instance;
const int G = 1, N = 2, C = 16, K = 16, H = 14, W = 14;
const size_t in_size = G * N * C * H * W * sizeof(ck::half_t);
const size_t wei_grad_size = G * K * C * 3 * 3 * sizeof(ck::half_t);
const size_t out_grad_size = G * N * K * H * W * sizeof(ck::half_t);
const size_t in_elements = G * N * C * H * W;
const size_t wei_grad_elements = G * K * C * 3 * 3;
const size_t out_grad_elements = G * N * K * H * W;
std::vector<ck::half_t> in_host(in_elements);
std::vector<ck::half_t> out_grad_host(out_grad_elements);
std::srand(12350);
for(size_t i = 0; i < in_elements; i++)
{
in_host[i] = ck::half_t(static_cast<float>(std::rand()) / RAND_MAX * 2.0f - 1.0f);
}
for(size_t i = 0; i < out_grad_elements; i++)
{
out_grad_host[i] = ck::half_t(static_cast<float>(std::rand()) / RAND_MAX * 2.0f - 1.0f);
}
ck::DeviceMem in_dev(in_size);
ck::DeviceMem wei_grad_invoker_dev(wei_grad_size);
ck::DeviceMem wei_grad_naive_dev(wei_grad_size);
ck::DeviceMem out_grad_dev(out_grad_size);
in_dev.ToDevice(in_host.data());
out_grad_dev.ToDevice(out_grad_host.data());
wei_grad_invoker_dev.SetZero();
wei_grad_naive_dev.SetZero();
std::vector<ck_tile::long_index_t> input_spatial{H, W};
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
std::vector<ck_tile::long_index_t> output_spatial{H, W};
std::vector<ck_tile::long_index_t> strides{1, 1};
std::vector<ck_tile::long_index_t> dilations{1, 1};
std::vector<ck_tile::long_index_t> left_pads{1, 1};
RefKernel builder_kernel;
// Run 1: Builder Invoker Interface
auto argument_ptr = builder_kernel.MakeArgumentPointer(
reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
reinterpret_cast<ck::half_t*>(wei_grad_invoker_dev.GetDeviceBuffer()),
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
G,
N,
K,
C,
input_spatial,
filter_spatial,
output_spatial,
strides,
dilations,
left_pads);
auto invoker_ptr = builder_kernel.MakeInvokerPointer();
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
// Run 2: Direct GPU Reference
ck_tile::naive_grouped_conv_bwd_weight<2, ck::half_t, ck::half_t, ck::half_t>(
reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
reinterpret_cast<ck::half_t*>(wei_grad_naive_dev.GetDeviceBuffer()),
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
G,
N,
K,
C,
input_spatial,
filter_spatial,
output_spatial,
strides,
dilations,
left_pads);
// Compare
std::vector<ck::half_t> wei_grad_invoker_result(wei_grad_elements);
std::vector<ck::half_t> wei_grad_naive_result(wei_grad_elements);
wei_grad_invoker_dev.FromDevice(wei_grad_invoker_result.data());
wei_grad_naive_dev.FromDevice(wei_grad_naive_result.data());
bool pass =
ck::utils::check_err(wei_grad_invoker_result,
wei_grad_naive_result,
"Error: Invoker Interface != Direct GPU Reference (Backward Weight)",
1e-6,
1e-6);
std::cout << "✓ Invoker Interface vs Direct GPU Reference (RANDOM - Backward Weight)!"
<< std::endl;
std::cout << " Result: " << (pass ? "IDENTICAL ✓" : "MISMATCH ✗") << std::endl;
    EXPECT_TRUE(pass);
}
} // namespace

View File

@@ -62,8 +62,6 @@ TEST(ReferenceInstanceTraits, Forward_2D_FP16)
// Verify instance_string() - now includes data type and layouts!
std::string instance_str = Traits::instance_string();
EXPECT_EQ(instance_str, "GPU_Reference_Forward_2D_fp16_NHWGC_GKYXC_NHWGK");
std::cout << "✓ Forward InstanceTraits validated: " << instance_str << std::endl;
}
TEST(ReferenceInstanceTraits, BackwardData_2D_FP16)
@@ -86,8 +84,6 @@ TEST(ReferenceInstanceTraits, BackwardData_2D_FP16)
std::string instance_str = Traits::instance_string();
EXPECT_EQ(instance_str, "GPU_Reference_BackwardData_2D_fp16_NHWGC_GKYXC_NHWGK");
std::cout << "✓ Backward Data InstanceTraits validated: " << instance_str << std::endl;
}
TEST(ReferenceInstanceTraits, BackwardWeight_2D_FP16)
@@ -110,8 +106,6 @@ TEST(ReferenceInstanceTraits, BackwardWeight_2D_FP16)
std::string instance_str = Traits::instance_string();
EXPECT_EQ(instance_str, "GPU_Reference_BackwardWeight_2D_fp16_NHWGC_GKYXC_NHWGK");
std::cout << "✓ Backward Weight InstanceTraits validated: " << instance_str << std::endl;
}
} // namespace