mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-20 06:49:15 +00:00
[CK_BUILDER] Replace reference conv with old ck implementation (#3604)
* ck-builder: remove SPATIAL_DIM parameter from ConvTensorLayouts This information is already in the SIGNATURE, so its pointless to pass it separately. This streamlines the interface of those functions a bit. Also touches up the style of those files in general. * ck-builder: implement reference conv using old ck The old ck implementation is more featureful and better tested. * ck-builder: replace test_reference_execution reference with old ck This strips out the ck-tile gpu reference implementation completely. * ck-builder: clean up test_reference_execution - Remove unneccesary messages - Replace EXPECT_TRUE(true) with EXPECT_NO_THROW()
This commit is contained in:
@@ -23,7 +23,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
|
||||
struct ConvBwdWeightDlFactory
|
||||
{
|
||||
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
|
||||
using Layouts = internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
|
||||
using Layouts = internal::ConvTensorLayouts<SIGNATURE>;
|
||||
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
|
||||
using Ops = internal::ConvElementwiseOps<SIGNATURE>;
|
||||
using AlgorithmType = decltype(ALGORITHM);
|
||||
|
||||
@@ -26,7 +26,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
|
||||
struct ConvBwdWeightMultiDWmmaV3Factory
|
||||
{
|
||||
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
|
||||
using Layouts = internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
|
||||
using Layouts = internal::ConvTensorLayouts<SIGNATURE>;
|
||||
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
|
||||
using Ops = internal::ConvElementwiseOps<SIGNATURE>;
|
||||
using AlgorithmType = decltype(ALGORITHM);
|
||||
|
||||
@@ -26,7 +26,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
|
||||
struct ConvBwdWeightMultiDXdlFactory
|
||||
{
|
||||
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
|
||||
using Layouts = internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
|
||||
using Layouts = internal::ConvTensorLayouts<SIGNATURE>;
|
||||
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
|
||||
using Ops = internal::ConvElementwiseOps<SIGNATURE>;
|
||||
using AlgorithmType = decltype(ALGORITHM);
|
||||
|
||||
@@ -26,7 +26,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
|
||||
struct ConvBwdWeightTwoStageWmmaV3Factory
|
||||
{
|
||||
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
|
||||
using Layouts = internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
|
||||
using Layouts = internal::ConvTensorLayouts<SIGNATURE>;
|
||||
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
|
||||
using Ops = internal::ConvElementwiseOps<SIGNATURE>;
|
||||
using AlgorithmType = decltype(ALGORITHM);
|
||||
|
||||
@@ -26,7 +26,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
|
||||
struct ConvBwdWeightTwoStageXdlFactory
|
||||
{
|
||||
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
|
||||
using Layouts = internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
|
||||
using Layouts = internal::ConvTensorLayouts<SIGNATURE>;
|
||||
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
|
||||
using Ops = internal::ConvElementwiseOps<SIGNATURE>;
|
||||
using AlgorithmType = decltype(ALGORITHM);
|
||||
|
||||
@@ -26,7 +26,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
|
||||
struct ConvBwdWeightWmmaFactory
|
||||
{
|
||||
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
|
||||
using Layouts = internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
|
||||
using Layouts = internal::ConvTensorLayouts<SIGNATURE>;
|
||||
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
|
||||
using Ops = internal::ConvElementwiseOps<SIGNATURE>;
|
||||
using AlgorithmType = decltype(ALGORITHM);
|
||||
|
||||
@@ -26,7 +26,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
|
||||
struct ConvBwdWeightWmmaV3Factory
|
||||
{
|
||||
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
|
||||
using Layouts = internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
|
||||
using Layouts = internal::ConvTensorLayouts<SIGNATURE>;
|
||||
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
|
||||
using Ops = internal::ConvElementwiseOps<SIGNATURE>;
|
||||
using AlgorithmType = decltype(ALGORITHM);
|
||||
|
||||
@@ -26,7 +26,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
|
||||
struct ConvBwdWeightXdlFactory
|
||||
{
|
||||
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
|
||||
using Layouts = internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
|
||||
using Layouts = internal::ConvTensorLayouts<SIGNATURE>;
|
||||
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
|
||||
using Ops = internal::ConvElementwiseOps<SIGNATURE>;
|
||||
using AlgorithmType = decltype(ALGORITHM);
|
||||
|
||||
@@ -26,7 +26,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
|
||||
struct ConvBwdWeightXdlV3Factory
|
||||
{
|
||||
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
|
||||
using Layouts = internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
|
||||
using Layouts = internal::ConvTensorLayouts<SIGNATURE>;
|
||||
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
|
||||
using Ops = internal::ConvElementwiseOps<SIGNATURE>;
|
||||
using AlgorithmType = decltype(ALGORITHM);
|
||||
|
||||
@@ -24,7 +24,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
|
||||
struct ConvFwdDlFactory
|
||||
{
|
||||
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
|
||||
using Layouts = internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
|
||||
using Layouts = internal::ConvTensorLayouts<SIGNATURE>;
|
||||
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
|
||||
using Ops = internal::ConvElementwiseOps<SIGNATURE>;
|
||||
using AlgorithmType = decltype(ALGORITHM);
|
||||
|
||||
@@ -26,7 +26,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
|
||||
struct ConvFwdLargeTensorFactory
|
||||
{
|
||||
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
|
||||
using Layouts = internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
|
||||
using Layouts = internal::ConvTensorLayouts<SIGNATURE>;
|
||||
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
|
||||
using Ops = internal::ConvElementwiseOps<SIGNATURE>;
|
||||
using AlgorithmType = decltype(ALGORITHM);
|
||||
|
||||
@@ -26,7 +26,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
|
||||
struct ConvFwdXdlV3Factory
|
||||
{
|
||||
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
|
||||
using Layouts = internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
|
||||
using Layouts = internal::ConvTensorLayouts<SIGNATURE>;
|
||||
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
|
||||
using Ops = internal::ConvElementwiseOps<SIGNATURE>;
|
||||
using AlgorithmType = decltype(ALGORITHM);
|
||||
|
||||
@@ -26,7 +26,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
|
||||
struct ConvFwdWmmaFactory
|
||||
{
|
||||
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
|
||||
using Layouts = internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
|
||||
using Layouts = internal::ConvTensorLayouts<SIGNATURE>;
|
||||
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
|
||||
using Ops = internal::ConvElementwiseOps<SIGNATURE>;
|
||||
using AlgorithmType = decltype(ALGORITHM);
|
||||
|
||||
@@ -26,7 +26,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
|
||||
struct ConvFwdXdlFactory
|
||||
{
|
||||
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
|
||||
using Layouts = internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
|
||||
using Layouts = internal::ConvTensorLayouts<SIGNATURE>;
|
||||
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
|
||||
using Ops = internal::ConvElementwiseOps<SIGNATURE>;
|
||||
using AlgorithmType = decltype(ALGORITHM);
|
||||
|
||||
@@ -29,7 +29,7 @@ template <ConvSignatureDescriptor auto SIGNATURE,
|
||||
struct ConvTileFactory
|
||||
{
|
||||
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
|
||||
using Layouts = internal::TileConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
|
||||
using Layouts = internal::TileConvTensorLayouts<SIGNATURE>;
|
||||
using Types = internal::TileConvTensorTypes<SIGNATURE.data_type>;
|
||||
using Ops = internal::TileElementwiseOps<SIGNATURE>;
|
||||
using AlgorithmType = decltype(ALGORITHM);
|
||||
|
||||
@@ -172,10 +172,10 @@ struct LayoutToCK<TensorLayout::GNDHWK>
|
||||
using type = ck::tensor_layout::convolution::GNDHWK;
|
||||
};
|
||||
|
||||
template <TensorLayout Layout>
|
||||
template <TensorLayout LAYOUT>
|
||||
consteval auto TensorLayoutToCK()
|
||||
{
|
||||
return typename LayoutToCK<Layout>::type{};
|
||||
return typename LayoutToCK<LAYOUT>::type{};
|
||||
}
|
||||
|
||||
struct EmptyAuxiliaryTensorLayout
|
||||
@@ -183,49 +183,52 @@ struct EmptyAuxiliaryTensorLayout
|
||||
using type = ck::Tuple<>;
|
||||
};
|
||||
|
||||
template <auto AuxiliaryTensorConfigsArray, size_t... Indices>
|
||||
template <auto AUXILIARY_TENSOR_CONFIGS_ARRAY, size_t... Indices>
|
||||
consteval auto GetAuxiliaryTensorLayoutTuple(std::index_sequence<Indices...>)
|
||||
{
|
||||
return ck::Tuple<
|
||||
decltype(TensorLayoutToCK<AuxiliaryTensorConfigsArray[Indices].layout>())...>{};
|
||||
decltype(TensorLayoutToCK<AUXILIARY_TENSOR_CONFIGS_ARRAY[Indices].layout>())...>{};
|
||||
}
|
||||
|
||||
template <auto AuxiliaryTensorConfigsValue, size_t SPATIAL_DIM>
|
||||
template <auto AUXILIARY_TENSOR_CONFIGS_VALUE, size_t SPATIAL_DIM>
|
||||
requires(ConvSpatialDim<SPATIAL_DIM>)
|
||||
struct AuxiliaryTensorLayouts
|
||||
{
|
||||
static constexpr auto Size = AuxiliaryTensorConfigsValue.size();
|
||||
using type = decltype(GetAuxiliaryTensorLayoutTuple<AuxiliaryTensorConfigsValue>(
|
||||
static constexpr auto Size = AUXILIARY_TENSOR_CONFIGS_VALUE.size();
|
||||
using type = decltype(GetAuxiliaryTensorLayoutTuple<AUXILIARY_TENSOR_CONFIGS_VALUE>(
|
||||
std::make_index_sequence<Size>{}));
|
||||
};
|
||||
|
||||
// TODO: Currently only the ouput tensor can have auxiliary tensors (e.g., bias).
|
||||
template <auto Signature, size_t SPATIAL_DIM>
|
||||
requires(HasElementwiseOpWithAuxiliaryOperands<decltype(Signature.output)>)
|
||||
template <auto SIGNATURE>
|
||||
requires HasElementwiseOpWithAuxiliaryOperands<decltype(SIGNATURE.output)>
|
||||
consteval auto GetAuxiliaryTensorLayouts()
|
||||
{
|
||||
return AuxiliaryTensorLayouts<Signature.output.operation.auxiliary_operand_configs,
|
||||
SPATIAL_DIM>{};
|
||||
return AuxiliaryTensorLayouts<SIGNATURE.output.operation.auxiliary_operand_configs,
|
||||
SIGNATURE.spatial_dim>{};
|
||||
}
|
||||
|
||||
template <auto Signature, size_t SPATIAL_DIM>
|
||||
requires(!HasElementwiseOpWithAuxiliaryOperands<decltype(Signature.output)>)
|
||||
template <auto SIGNATURE>
|
||||
requires(!HasElementwiseOpWithAuxiliaryOperands<decltype(SIGNATURE.output)>)
|
||||
consteval auto GetAuxiliaryTensorLayouts()
|
||||
{
|
||||
return EmptyAuxiliaryTensorLayout{};
|
||||
}
|
||||
|
||||
template <auto Signature, size_t SPATIAL_DIM>
|
||||
requires(ConvSpatialDim<SPATIAL_DIM> &&
|
||||
ValidConvInputLayoutForSpatialDim<Signature.input.config.layout, SPATIAL_DIM> &&
|
||||
ValidConvWeightLayoutForSpatialDim<Signature.weight.config.layout, SPATIAL_DIM> &&
|
||||
ValidConvOutputLayoutForSpatialDim<Signature.output.config.layout, SPATIAL_DIM>)
|
||||
template <auto SIGNATURE>
|
||||
requires ConvSpatialDim<SIGNATURE.spatial_dim> &&
|
||||
ValidConvInputLayoutForSpatialDim<SIGNATURE.input.config.layout,
|
||||
SIGNATURE.spatial_dim> &&
|
||||
ValidConvWeightLayoutForSpatialDim<SIGNATURE.weight.config.layout,
|
||||
SIGNATURE.spatial_dim> &&
|
||||
ValidConvOutputLayoutForSpatialDim<SIGNATURE.output.config.layout,
|
||||
SIGNATURE.spatial_dim>
|
||||
struct ConvTensorLayouts
|
||||
{
|
||||
using InLayout = decltype(TensorLayoutToCK<Signature.input.config.layout>());
|
||||
using WeiLayout = decltype(TensorLayoutToCK<Signature.weight.config.layout>());
|
||||
using OutLayout = decltype(TensorLayoutToCK<Signature.output.config.layout>());
|
||||
using DsLayout = decltype(GetAuxiliaryTensorLayouts<Signature, SPATIAL_DIM>())::type;
|
||||
using InLayout = decltype(TensorLayoutToCK<SIGNATURE.input.config.layout>());
|
||||
using WeiLayout = decltype(TensorLayoutToCK<SIGNATURE.weight.config.layout>());
|
||||
using OutLayout = decltype(TensorLayoutToCK<SIGNATURE.output.config.layout>());
|
||||
using DsLayout = decltype(GetAuxiliaryTensorLayouts<SIGNATURE>())::type;
|
||||
};
|
||||
|
||||
} // namespace ck_tile::builder::factory::internal
|
||||
|
||||
@@ -9,10 +9,10 @@
|
||||
|
||||
namespace ck_tile::builder::factory::internal {
|
||||
using ALayout = ck_tile::tensor_layout::convolution::NWGC;
|
||||
template <TensorLayout Layout>
|
||||
template <TensorLayout LAYOUT>
|
||||
struct LayoutToCKTile
|
||||
{
|
||||
static_assert(sizeof(UnsupportedEnumValue<Layout>) == 0,
|
||||
static_assert(sizeof(UnsupportedEnumValue<LAYOUT>) == 0,
|
||||
"Unsupported layout conversion to CK.");
|
||||
};
|
||||
|
||||
@@ -152,49 +152,52 @@ struct EmptyAuxiliaryTileTensorLayout
|
||||
using type = ck_tile::tuple<>;
|
||||
};
|
||||
|
||||
template <auto AuxiliaryTileTensorConfigsArray, size_t... Indices>
|
||||
template <auto AUXILIARY_TILE_TENSOR_CONFIGS_ARRAY, size_t... Indices>
|
||||
consteval auto GetAuxiliaryTileTensorLayoutTuple(std::index_sequence<Indices...>)
|
||||
{
|
||||
return ck_tile::tuple<
|
||||
decltype(TensorLayoutToCKTile<AuxiliaryTileTensorConfigsArray[Indices].layout>())...>{};
|
||||
decltype(TensorLayoutToCKTile<AUXILIARY_TILE_TENSOR_CONFIGS_ARRAY[Indices].layout>())...>{};
|
||||
}
|
||||
|
||||
template <auto AuxiliaryTileTensorConfigsValue, size_t SPATIAL_DIM>
|
||||
requires(ConvSpatialDim<SPATIAL_DIM>)
|
||||
template <auto AUXILIARY_TILE_TENSOR_CONFIGS_VALUE, size_t SPATIAL_DIM>
|
||||
requires ConvSpatialDim<SPATIAL_DIM>
|
||||
struct AuxiliaryTileTensorLayouts
|
||||
{
|
||||
static constexpr auto Size = AuxiliaryTileTensorConfigsValue.size();
|
||||
using type = decltype(GetAuxiliaryTileTensorLayoutTuple<AuxiliaryTileTensorConfigsValue>(
|
||||
static constexpr auto Size = AUXILIARY_TILE_TENSOR_CONFIGS_VALUE.size();
|
||||
using type = decltype(GetAuxiliaryTileTensorLayoutTuple<AUXILIARY_TILE_TENSOR_CONFIGS_VALUE>(
|
||||
std::make_index_sequence<Size>{}));
|
||||
};
|
||||
|
||||
// TODO: Currently only the ouput tensor can have auxiliary tensors (e.g., bias).
|
||||
template <auto Signature, size_t SPATIAL_DIM>
|
||||
requires(HasElementwiseOpWithAuxiliaryOperands<decltype(Signature.output)>)
|
||||
template <auto SIGNATURE>
|
||||
requires HasElementwiseOpWithAuxiliaryOperands<decltype(SIGNATURE.output)>
|
||||
consteval auto GetAuxiliaryTileTensorLayouts()
|
||||
{
|
||||
return AuxiliaryTileTensorLayouts<Signature.output.operation.auxiliary_operand_configs,
|
||||
SPATIAL_DIM>{};
|
||||
return AuxiliaryTileTensorLayouts<SIGNATURE.output.operation.auxiliary_operand_configs,
|
||||
SIGNATURE.spatial_dim>{};
|
||||
}
|
||||
|
||||
template <auto Signature, size_t SPATIAL_DIM>
|
||||
requires(!HasElementwiseOpWithAuxiliaryOperands<decltype(Signature.output)>)
|
||||
template <auto SIGNATURE>
|
||||
requires(!HasElementwiseOpWithAuxiliaryOperands<decltype(SIGNATURE.output)>)
|
||||
consteval auto GetAuxiliaryTileTensorLayouts()
|
||||
{
|
||||
return EmptyAuxiliaryTileTensorLayout{};
|
||||
}
|
||||
|
||||
template <auto Signature, size_t SPATIAL_DIM>
|
||||
requires(ConvSpatialDim<SPATIAL_DIM> &&
|
||||
ValidConvInputLayoutForSpatialDim<Signature.input.config.layout, SPATIAL_DIM> &&
|
||||
ValidConvWeightLayoutForSpatialDim<Signature.weight.config.layout, SPATIAL_DIM> &&
|
||||
ValidConvOutputLayoutForSpatialDim<Signature.output.config.layout, SPATIAL_DIM>)
|
||||
template <auto SIGNATURE>
|
||||
requires ConvSpatialDim<SIGNATURE.spatial_dim> &&
|
||||
ValidConvInputLayoutForSpatialDim<SIGNATURE.input.config.layout,
|
||||
SIGNATURE.spatial_dim> &&
|
||||
ValidConvWeightLayoutForSpatialDim<SIGNATURE.weight.config.layout,
|
||||
SIGNATURE.spatial_dim> &&
|
||||
ValidConvOutputLayoutForSpatialDim<SIGNATURE.output.config.layout,
|
||||
SIGNATURE.spatial_dim>
|
||||
struct TileConvTensorLayouts
|
||||
{
|
||||
using ALayout = decltype(TensorLayoutToCKTile<Signature.input.config.layout>());
|
||||
using BLayout = decltype(TensorLayoutToCKTile<Signature.weight.config.layout>());
|
||||
using ELayout = decltype(TensorLayoutToCKTile<Signature.output.config.layout>());
|
||||
using DsLayout = decltype(GetAuxiliaryTileTensorLayouts<Signature, SPATIAL_DIM>())::type;
|
||||
using ALayout = decltype(TensorLayoutToCKTile<SIGNATURE.input.config.layout>());
|
||||
using BLayout = decltype(TensorLayoutToCKTile<SIGNATURE.weight.config.layout>());
|
||||
using ELayout = decltype(TensorLayoutToCKTile<SIGNATURE.output.config.layout>());
|
||||
using DsLayout = decltype(GetAuxiliaryTileTensorLayouts<SIGNATURE>())::type;
|
||||
};
|
||||
|
||||
} // namespace ck_tile::builder::factory::internal
|
||||
|
||||
@@ -1,118 +0,0 @@
|
||||
// Copyright (c) Advanced Micro Devices, Inc., or its affiliates.
|
||||
// SPDX-License-Identifier: MIT
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/builder/conv_signature_concepts.hpp"
|
||||
#include "ck_tile/builder/types.hpp"
|
||||
#include <vector>
|
||||
|
||||
namespace ck_tile::builder::factory::internal {
|
||||
|
||||
// Validation helper: Ensure reference implementation only receives PassThrough elementwise ops
|
||||
template <auto SIGNATURE>
|
||||
consteval void ValidateReferenceSignature()
|
||||
{
|
||||
using namespace ck_tile::builder;
|
||||
|
||||
// Check input elementwise operation
|
||||
static_assert(
|
||||
!HasTensorOp<decltype(SIGNATURE.input)> ||
|
||||
SIGNATURE.input.operation.elementwise_operation == ElementwiseOperation::PASS_THROUGH,
|
||||
"Reference implementation does not support elementwise operations on input tensor. "
|
||||
"Input operation must be PassThrough (or not specified).");
|
||||
|
||||
// Check weight elementwise operation
|
||||
static_assert(
|
||||
!HasTensorOp<decltype(SIGNATURE.weight)> ||
|
||||
SIGNATURE.weight.operation.elementwise_operation == ElementwiseOperation::PASS_THROUGH,
|
||||
"Reference implementation does not support elementwise operations on weight tensor. "
|
||||
"Weight operation must be PassThrough (or not specified).");
|
||||
|
||||
// Check output elementwise operation
|
||||
static_assert(
|
||||
!HasTensorOp<decltype(SIGNATURE.output)> ||
|
||||
SIGNATURE.output.operation.elementwise_operation == ElementwiseOperation::PASS_THROUGH,
|
||||
"Reference implementation does not support elementwise operations on output tensor. "
|
||||
"Output operation must be PassThrough (or not specified).");
|
||||
}
|
||||
|
||||
// Common argument structure for reference convolution implementations
|
||||
// Template parameters allow different const qualifiers for each direction
|
||||
template <typename InPtrType, typename WeiPtrType, typename OutPtrType>
|
||||
struct ReferenceConvArgument
|
||||
{
|
||||
InPtrType input_;
|
||||
WeiPtrType weight_;
|
||||
OutPtrType output_;
|
||||
int G_, N_, K_, C_;
|
||||
std::vector<ck_tile::long_index_t> input_spatial_;
|
||||
std::vector<ck_tile::long_index_t> filter_spatial_;
|
||||
std::vector<ck_tile::long_index_t> output_spatial_;
|
||||
std::vector<ck_tile::long_index_t> strides_;
|
||||
std::vector<ck_tile::long_index_t> dilations_;
|
||||
std::vector<ck_tile::long_index_t> left_pads_;
|
||||
|
||||
ReferenceConvArgument(InPtrType input,
|
||||
WeiPtrType weight,
|
||||
OutPtrType output,
|
||||
int G,
|
||||
int N,
|
||||
int K,
|
||||
int C,
|
||||
const std::vector<ck_tile::long_index_t>& input_spatial,
|
||||
const std::vector<ck_tile::long_index_t>& filter_spatial,
|
||||
const std::vector<ck_tile::long_index_t>& output_spatial,
|
||||
const std::vector<ck_tile::long_index_t>& strides,
|
||||
const std::vector<ck_tile::long_index_t>& dilations,
|
||||
const std::vector<ck_tile::long_index_t>& left_pads)
|
||||
: input_(input),
|
||||
weight_(weight),
|
||||
output_(output),
|
||||
G_(G),
|
||||
N_(N),
|
||||
K_(K),
|
||||
C_(C),
|
||||
input_spatial_(input_spatial),
|
||||
filter_spatial_(filter_spatial),
|
||||
output_spatial_(output_spatial),
|
||||
strides_(strides),
|
||||
dilations_(dilations),
|
||||
left_pads_(left_pads)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
// Common invoker structure for reference convolution implementations
|
||||
// Takes a callable (lambda or function pointer) to execute the actual convolution
|
||||
template <typename ArgumentType, typename ConvFunc>
|
||||
struct ReferenceConvInvoker
|
||||
{
|
||||
ConvFunc conv_func_;
|
||||
|
||||
explicit ReferenceConvInvoker(ConvFunc func) : conv_func_(func) {}
|
||||
|
||||
float Run(const ArgumentType* arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
{
|
||||
(void)stream_config; // Unused for reference implementation
|
||||
|
||||
conv_func_(arg->input_,
|
||||
arg->weight_,
|
||||
arg->output_,
|
||||
arg->G_,
|
||||
arg->N_,
|
||||
arg->K_,
|
||||
arg->C_,
|
||||
arg->input_spatial_,
|
||||
arg->filter_spatial_,
|
||||
arg->output_spatial_,
|
||||
arg->strides_,
|
||||
arg->dilations_,
|
||||
arg->left_pads_);
|
||||
|
||||
return 0.0f; // Reference implementation doesn't track timing
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace ck_tile::builder::factory::internal
|
||||
@@ -3,15 +3,15 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck_tile/ref/naive_grouped_conv_fwd_gpu.hpp"
|
||||
#include "ck_tile/ref/naive_grouped_conv_bwd_data_gpu.hpp"
|
||||
#include "ck_tile/ref/naive_grouped_conv_bwd_weight_gpu.hpp"
|
||||
#include "ck_tile/builder/conv_signature_concepts.hpp"
|
||||
#include "ck_tile/builder/conv_algorithm_concepts.hpp"
|
||||
#include "ck_tile/builder/types.hpp"
|
||||
#include "ck_tile/builder/factory/helpers/ck/conv_tensor_type.hpp"
|
||||
#include "ck_tile/builder/factory/reference_common.hpp"
|
||||
#include "ck_tile/core.hpp"
|
||||
#include "ck_tile/builder/factory/helpers/ck/conv_tensor_layout.hpp"
|
||||
#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp"
|
||||
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp"
|
||||
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp"
|
||||
#include "ck/library/utility/convolution_parameter.hpp"
|
||||
#include <memory>
|
||||
|
||||
namespace ck_tile::builder::factory {
|
||||
@@ -22,16 +22,23 @@ template <ConvSignatureDescriptor auto SIGNATURE,
|
||||
StringLiteral VERSION>
|
||||
struct ReferenceFactory
|
||||
{
|
||||
// Validate that only PassThrough elementwise operations are specified
|
||||
static constexpr auto kValidation = (internal::ValidateReferenceSignature<SIGNATURE>(), 0);
|
||||
|
||||
static constexpr size_t SPATIAL_DIM = SIGNATURE.spatial_dim;
|
||||
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
|
||||
|
||||
using Types = internal::ConvTensorDataTypes<SIGNATURE>;
|
||||
using InDataType = typename Types::InDataType;
|
||||
using WeiDataType = typename Types::WeiDataType;
|
||||
using OutDataType = typename Types::OutDataType;
|
||||
|
||||
using Layouts = factory::internal::ConvTensorLayouts<SIGNATURE>;
|
||||
using InLayout = typename Layouts::InLayout;
|
||||
using WeiLayout = typename Layouts::WeiLayout;
|
||||
using OutLayout = typename Layouts::OutLayout;
|
||||
|
||||
using Ops = factory::internal::ConvElementwiseOps<SIGNATURE>;
|
||||
using InElementwiseOp = typename Ops::InElementwiseOp;
|
||||
using WeiElementwiseOp = typename Ops::WeiElementwiseOp;
|
||||
using OutElementwiseOp = typename Ops::OutElementwiseOp;
|
||||
|
||||
struct Instance
|
||||
{
|
||||
// Store template parameters for InstanceTraits reflection
|
||||
@@ -39,91 +46,57 @@ struct ReferenceFactory
|
||||
static constexpr auto kAlgorithm = ALGORITHM;
|
||||
static constexpr auto kVersion = VERSION;
|
||||
|
||||
// Argument and Invoker types depend on direction
|
||||
// Forward: const input, const weight, mutable output
|
||||
// Backward Data: mutable input, const weight, const output_grad
|
||||
// Backward Weight: const input, mutable weight_grad, const output_grad
|
||||
|
||||
// Use appropriate Argument type based on direction
|
||||
using Argument = std::conditional_t<
|
||||
ConvDirectionIsForward<SIGNATURE>,
|
||||
internal::ReferenceConvArgument<const InDataType*, const WeiDataType*, OutDataType*>,
|
||||
std::conditional_t<
|
||||
ConvDirectionIsBackwardData<SIGNATURE>,
|
||||
internal::
|
||||
ReferenceConvArgument<InDataType*, const WeiDataType*, const OutDataType*>,
|
||||
internal::
|
||||
ReferenceConvArgument<const InDataType*, WeiDataType*, const OutDataType*>>>;
|
||||
|
||||
// Invoker calls the appropriate reference implementation based on direction
|
||||
struct Invoker
|
||||
/// @brief Invoke reference convolution
|
||||
///
|
||||
/// This is the primary overload to invoke reference convolution. As the underlying
|
||||
/// function requires it, this function accepts ConvParam directly.
|
||||
template <typename InPtrType, typename WeiPtrType, typename OutPtrType>
|
||||
static void Run(InPtrType* input,
|
||||
WeiPtrType* weight,
|
||||
OutPtrType* output,
|
||||
const ck::utils::conv::ConvParam& param,
|
||||
InElementwiseOp in_op = InElementwiseOp{},
|
||||
WeiElementwiseOp wei_op = WeiElementwiseOp{},
|
||||
OutElementwiseOp out_op = OutElementwiseOp{})
|
||||
{
|
||||
float Run(const Argument* arg, const StreamConfig& stream_config = StreamConfig{})
|
||||
if constexpr(ConvDirectionIsForward<SIGNATURE>)
|
||||
{
|
||||
(void)stream_config; // Unused for reference implementation
|
||||
|
||||
if constexpr(ConvDirectionIsForward<SIGNATURE>)
|
||||
{
|
||||
ck_tile::
|
||||
naive_grouped_conv_fwd<SPATIAL_DIM, InDataType, WeiDataType, OutDataType>(
|
||||
arg->input_,
|
||||
arg->weight_,
|
||||
arg->output_,
|
||||
arg->G_,
|
||||
arg->N_,
|
||||
arg->K_,
|
||||
arg->C_,
|
||||
arg->input_spatial_,
|
||||
arg->filter_spatial_,
|
||||
arg->output_spatial_,
|
||||
arg->strides_,
|
||||
arg->dilations_,
|
||||
arg->left_pads_);
|
||||
}
|
||||
else if constexpr(ConvDirectionIsBackwardData<SIGNATURE>)
|
||||
{
|
||||
ck_tile::naive_grouped_conv_bwd_data<SPATIAL_DIM,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType>(arg->input_,
|
||||
arg->weight_,
|
||||
arg->output_,
|
||||
arg->G_,
|
||||
arg->N_,
|
||||
arg->K_,
|
||||
arg->C_,
|
||||
arg->input_spatial_,
|
||||
arg->filter_spatial_,
|
||||
arg->output_spatial_,
|
||||
arg->strides_,
|
||||
arg->dilations_,
|
||||
arg->left_pads_);
|
||||
}
|
||||
else if constexpr(ConvDirectionIsBackwardWeight<SIGNATURE>)
|
||||
{
|
||||
ck_tile::naive_grouped_conv_bwd_weight<SPATIAL_DIM,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType>(arg->input_,
|
||||
arg->weight_,
|
||||
arg->output_,
|
||||
arg->G_,
|
||||
arg->N_,
|
||||
arg->K_,
|
||||
arg->C_,
|
||||
arg->input_spatial_,
|
||||
arg->filter_spatial_,
|
||||
arg->output_spatial_,
|
||||
arg->strides_,
|
||||
arg->dilations_,
|
||||
arg->left_pads_);
|
||||
}
|
||||
|
||||
return 0.0f; // Reference implementation doesn't track timing
|
||||
ck::ref::naive_conv_fwd<InLayout, WeiLayout, OutLayout>(
|
||||
static_cast<const InDataType*>(input),
|
||||
static_cast<const WeiDataType*>(weight),
|
||||
static_cast<OutDataType*>(output),
|
||||
param,
|
||||
in_op,
|
||||
wei_op,
|
||||
out_op);
|
||||
}
|
||||
};
|
||||
else if constexpr(ConvDirectionIsBackwardData<SIGNATURE>)
|
||||
{
|
||||
ck::ref::naive_conv_bwd_data<InLayout, WeiLayout, OutLayout>(
|
||||
static_cast<InDataType*>(input),
|
||||
static_cast<const WeiDataType*>(weight),
|
||||
static_cast<const OutDataType*>(output),
|
||||
param,
|
||||
in_op,
|
||||
wei_op,
|
||||
out_op);
|
||||
}
|
||||
else if constexpr(ConvDirectionIsBackwardWeight<SIGNATURE>)
|
||||
{
|
||||
ck::ref::naive_conv_bwd_weight<InLayout, WeiLayout, OutLayout>(
|
||||
static_cast<const InDataType*>(input),
|
||||
static_cast<WeiDataType*>(weight),
|
||||
static_cast<const OutDataType*>(output),
|
||||
param,
|
||||
in_op,
|
||||
wei_op,
|
||||
out_op);
|
||||
}
|
||||
}
|
||||
|
||||
// Direct Run method (simpler interface, direction-agnostic)
|
||||
/// @brief Invoke reference convolution
|
||||
///
|
||||
/// Convenience overload to avoid having to construct ConvParam manually.
|
||||
template <typename InPtrType, typename WeiPtrType, typename OutPtrType>
|
||||
static void Run(InPtrType* input,
|
||||
WeiPtrType* weight,
|
||||
@@ -132,68 +105,27 @@ struct ReferenceFactory
|
||||
int N,
|
||||
int K,
|
||||
int C,
|
||||
const std::vector<ck_tile::long_index_t>& input_spatial,
|
||||
const std::vector<ck_tile::long_index_t>& filter_spatial,
|
||||
const std::vector<ck_tile::long_index_t>& output_spatial,
|
||||
const std::vector<ck_tile::long_index_t>& strides,
|
||||
const std::vector<ck_tile::long_index_t>& dilations,
|
||||
const std::vector<ck_tile::long_index_t>& left_pads)
|
||||
const std::vector<ck::long_index_t>& input_spatial,
|
||||
const std::vector<ck::long_index_t>& filter_spatial,
|
||||
const std::vector<ck::long_index_t>& strides,
|
||||
const std::vector<ck::long_index_t>& dilations,
|
||||
const std::vector<ck::long_index_t>& left_pads,
|
||||
const std::vector<ck::long_index_t>& right_pads)
|
||||
{
|
||||
if constexpr(ConvDirectionIsForward<SIGNATURE>)
|
||||
{
|
||||
ck_tile::naive_grouped_conv_fwd<SPATIAL_DIM, InDataType, WeiDataType, OutDataType>(
|
||||
static_cast<const InDataType*>(input),
|
||||
static_cast<const WeiDataType*>(weight),
|
||||
static_cast<OutDataType*>(output),
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
input_spatial,
|
||||
filter_spatial,
|
||||
output_spatial,
|
||||
strides,
|
||||
dilations,
|
||||
left_pads);
|
||||
}
|
||||
else if constexpr(ConvDirectionIsBackwardData<SIGNATURE>)
|
||||
{
|
||||
ck_tile::
|
||||
naive_grouped_conv_bwd_data<SPATIAL_DIM, InDataType, WeiDataType, OutDataType>(
|
||||
static_cast<InDataType*>(input),
|
||||
static_cast<const WeiDataType*>(weight),
|
||||
static_cast<const OutDataType*>(output),
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
input_spatial,
|
||||
filter_spatial,
|
||||
output_spatial,
|
||||
strides,
|
||||
dilations,
|
||||
left_pads);
|
||||
}
|
||||
else if constexpr(ConvDirectionIsBackwardWeight<SIGNATURE>)
|
||||
{
|
||||
ck_tile::naive_grouped_conv_bwd_weight<SPATIAL_DIM,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
OutDataType>(
|
||||
static_cast<const InDataType*>(input),
|
||||
static_cast<WeiDataType*>(weight),
|
||||
static_cast<const OutDataType*>(output),
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
input_spatial,
|
||||
filter_spatial,
|
||||
output_spatial,
|
||||
strides,
|
||||
dilations,
|
||||
left_pads);
|
||||
}
|
||||
Run(input,
|
||||
weight,
|
||||
output,
|
||||
ck::utils::conv::ConvParam(SPATIAL_DIM,
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
filter_spatial,
|
||||
input_spatial,
|
||||
strides,
|
||||
dilations,
|
||||
left_pads,
|
||||
right_pads));
|
||||
}
|
||||
|
||||
std::string GetTypeString() const
|
||||
@@ -209,41 +141,6 @@ struct ReferenceFactory
|
||||
return std::string("GPU_Reference_") + dir_str + "_" + std::to_string(SPATIAL_DIM) +
|
||||
"D";
|
||||
}
|
||||
|
||||
// Old CK interface: Create argument pointer
|
||||
template <typename InPtrType, typename WeiPtrType, typename OutPtrType>
|
||||
std::unique_ptr<Argument>
|
||||
MakeArgumentPointer(InPtrType input,
|
||||
WeiPtrType weight,
|
||||
OutPtrType output,
|
||||
int G,
|
||||
int N,
|
||||
int K,
|
||||
int C,
|
||||
const std::vector<ck_tile::long_index_t>& input_spatial,
|
||||
const std::vector<ck_tile::long_index_t>& filter_spatial,
|
||||
const std::vector<ck_tile::long_index_t>& output_spatial,
|
||||
const std::vector<ck_tile::long_index_t>& strides,
|
||||
const std::vector<ck_tile::long_index_t>& dilations,
|
||||
const std::vector<ck_tile::long_index_t>& left_pads) const
|
||||
{
|
||||
return std::make_unique<Argument>(input,
|
||||
weight,
|
||||
output,
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
input_spatial,
|
||||
filter_spatial,
|
||||
output_spatial,
|
||||
strides,
|
||||
dilations,
|
||||
left_pads);
|
||||
}
|
||||
|
||||
// Old CK interface: Create invoker pointer
|
||||
std::unique_ptr<Invoker> MakeInvokerPointer() const { return std::make_unique<Invoker>(); }
|
||||
};
|
||||
};
|
||||
|
||||
|
||||
@@ -76,7 +76,7 @@ struct Args<SIGNATURE>
|
||||
using Ops = factory::internal::ConvElementwiseOps<SIGNATURE>;
|
||||
|
||||
// TODO: We shouldn't need to call into an internal namespace here.
|
||||
using Layouts = factory::internal::ConvTensorLayouts<SIGNATURE, SPATIAL_DIM>;
|
||||
using Layouts = factory::internal::ConvTensorLayouts<SIGNATURE>;
|
||||
|
||||
ConvTensorLengths<SPATIAL_DIM> lengths;
|
||||
|
||||
|
||||
@@ -32,27 +32,8 @@ concept RefConvInstance = requires(Conv& conv,
|
||||
const void* input,
|
||||
const void* weight,
|
||||
void* output,
|
||||
int G,
|
||||
int N,
|
||||
int K,
|
||||
int C,
|
||||
std::vector<long_index_t> dims) {
|
||||
{
|
||||
conv.Run(input,
|
||||
weight,
|
||||
output,
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
dims, // input_spatial
|
||||
dims, // filter_spatial
|
||||
dims, // output_spatial
|
||||
dims, // strides
|
||||
dims, // dilations
|
||||
dims // left_pads
|
||||
)
|
||||
};
|
||||
ck::utils::conv::ConvParam param) {
|
||||
{ conv.Run(input, weight, output, param) };
|
||||
};
|
||||
|
||||
/// @brief `run()` specialization for forward convolution and the reference
|
||||
@@ -84,16 +65,6 @@ std::tuple<bool, float> run(RefConvInstance<SIGNATURE> auto& conv,
|
||||
// Just throw for now, but regard these as TODO items that should be resolved
|
||||
// eventually.
|
||||
|
||||
// Right pads are not supported right now for some reason.
|
||||
for(auto right_pad : param.input_right_pads_)
|
||||
{
|
||||
if(right_pad != 0)
|
||||
{
|
||||
std::cout << "TODO: Support right pad in reference conv" << std::endl;
|
||||
return std::make_tuple(false, 0.0f);
|
||||
}
|
||||
}
|
||||
|
||||
if(!args.make_input_descriptor().is_packed())
|
||||
{
|
||||
std::cout << "TODO: Support non-packed input tensor in reference conv" << std::endl;
|
||||
@@ -110,19 +81,7 @@ std::tuple<bool, float> run(RefConvInstance<SIGNATURE> auto& conv,
|
||||
return std::make_tuple(false, 0.0f);
|
||||
}
|
||||
|
||||
conv.Run(inputs.input,
|
||||
inputs.weight,
|
||||
outputs.output,
|
||||
param.G_,
|
||||
param.N_,
|
||||
param.K_,
|
||||
param.C_,
|
||||
param.input_spatial_lengths_,
|
||||
param.filter_spatial_lengths_,
|
||||
param.output_spatial_lengths_,
|
||||
param.conv_filter_strides_,
|
||||
param.conv_filter_dilations_,
|
||||
param.input_left_pads_);
|
||||
conv.Run(inputs.input, inputs.weight, outputs.output, param);
|
||||
return std::make_tuple(true, 0.0f);
|
||||
}
|
||||
|
||||
|
||||
@@ -38,7 +38,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor1D_NWGC_GKXC_NWGK)
|
||||
.weight = {.config = {.layout = GKXC}},
|
||||
.output = {.config = {.layout = NWGK}}};
|
||||
|
||||
using TensorLayouts = ConvTensorLayouts<sig, 1>;
|
||||
using TensorLayouts = ConvTensorLayouts<sig>;
|
||||
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::NWGC>));
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKXC>));
|
||||
@@ -57,7 +57,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor1D_NGCW_GKXC_NGKW)
|
||||
.weight = {.config = {.layout = GKXC}},
|
||||
.output = {.config = {.layout = NGKW}}};
|
||||
|
||||
using TensorLayouts = ConvTensorLayouts<sig, 1>;
|
||||
using TensorLayouts = ConvTensorLayouts<sig>;
|
||||
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::NGCW>));
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKXC>));
|
||||
@@ -76,7 +76,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor1D_GNWC_GKXC_GNWK)
|
||||
.weight = {.config = {.layout = GKXC}},
|
||||
.output = {.config = {.layout = GNWK}}};
|
||||
|
||||
using TensorLayouts = ConvTensorLayouts<sig, 1>;
|
||||
using TensorLayouts = ConvTensorLayouts<sig>;
|
||||
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::GNWC>));
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKXC>));
|
||||
@@ -95,7 +95,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor1D_NGCW_GKCX_NGKW)
|
||||
.weight = {.config = {.layout = GKCX}},
|
||||
.output = {.config = {.layout = NGKW}}};
|
||||
|
||||
using TensorLayouts = ConvTensorLayouts<sig, 1>;
|
||||
using TensorLayouts = ConvTensorLayouts<sig>;
|
||||
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::NGCW>));
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKCX>));
|
||||
@@ -114,7 +114,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor2D_NGCHW_GKYXC_NGKHW)
|
||||
.weight = {.config = {.layout = GKYXC}},
|
||||
.output = {.config = {.layout = NGKHW}}};
|
||||
|
||||
using TensorLayouts = ConvTensorLayouts<sig, 2>;
|
||||
using TensorLayouts = ConvTensorLayouts<sig>;
|
||||
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::NGCHW>));
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKYXC>));
|
||||
@@ -133,7 +133,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor2D_NHWGC_GKYXC_NHWGK)
|
||||
.weight = {.config = {.layout = GKYXC}},
|
||||
.output = {.config = {.layout = NHWGK}}};
|
||||
|
||||
using TensorLayouts = ConvTensorLayouts<sig, 2>;
|
||||
using TensorLayouts = ConvTensorLayouts<sig>;
|
||||
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::NHWGC>));
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKYXC>));
|
||||
@@ -152,7 +152,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor2D_GNHWC_GKYXC_GNHWK)
|
||||
.weight = {.config = {.layout = GKYXC}},
|
||||
.output = {.config = {.layout = GNHWK}}};
|
||||
|
||||
using TensorLayouts = ConvTensorLayouts<sig, 2>;
|
||||
using TensorLayouts = ConvTensorLayouts<sig>;
|
||||
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::GNHWC>));
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKYXC>));
|
||||
@@ -171,7 +171,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor2D_NGCHW_GKCYX_NGKHW)
|
||||
.weight = {.config = {.layout = GKCYX}},
|
||||
.output = {.config = {.layout = NGKHW}}};
|
||||
|
||||
using TensorLayouts = ConvTensorLayouts<sig, 2>;
|
||||
using TensorLayouts = ConvTensorLayouts<sig>;
|
||||
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::NGCHW>));
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKCYX>));
|
||||
@@ -190,7 +190,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor3D_NGCDHW_GKCZYX_NGKDHW)
|
||||
.weight = {.config = {.layout = GKCZYX}},
|
||||
.output = {.config = {.layout = NGKDHW}}};
|
||||
|
||||
using TensorLayouts = ConvTensorLayouts<sig, 3>;
|
||||
using TensorLayouts = ConvTensorLayouts<sig>;
|
||||
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::NGCDHW>));
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKCZYX>));
|
||||
@@ -209,7 +209,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor3D_NDHWGC_GKZYXC_NDHWGK)
|
||||
.weight = {.config = {.layout = GKZYXC}},
|
||||
.output = {.config = {.layout = NDHWGK}}};
|
||||
|
||||
using TensorLayouts = ConvTensorLayouts<sig, 3>;
|
||||
using TensorLayouts = ConvTensorLayouts<sig>;
|
||||
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::NDHWGC>));
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKZYXC>));
|
||||
@@ -228,7 +228,7 @@ TEST(ConvTensorLayout, AssignsLayoutsFor3D_GNDHWC_GKZYXC_GNDHWK)
|
||||
.weight = {.config = {.layout = GKZYXC}},
|
||||
.output = {.config = {.layout = GNDHWK}}};
|
||||
|
||||
using TensorLayouts = ConvTensorLayouts<sig, 3>;
|
||||
using TensorLayouts = ConvTensorLayouts<sig>;
|
||||
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::GNDHWC>));
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKZYXC>));
|
||||
@@ -387,7 +387,7 @@ TEST(ConvTensorLayoutsWithAuxiliary, Conv2DWithSingleBiasG_K)
|
||||
.operation =
|
||||
OutputOp{.elementwise_operation = ElementwiseOperation::SCALE}}};
|
||||
|
||||
using TensorLayouts = ConvTensorLayouts<sig, 2>;
|
||||
using TensorLayouts = ConvTensorLayouts<sig>;
|
||||
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::NGCHW>));
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKYXC>));
|
||||
@@ -414,7 +414,7 @@ TEST(ConvTensorLayoutsWithAuxiliary, Conv2DWithSingleBiasGC)
|
||||
.operation =
|
||||
OutputOp{.elementwise_operation = ElementwiseOperation::SCALE}}};
|
||||
|
||||
using TensorLayouts = ConvTensorLayouts<sig, 2>;
|
||||
using TensorLayouts = ConvTensorLayouts<sig>;
|
||||
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::NHWGC>));
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKYXC>));
|
||||
@@ -442,7 +442,7 @@ TEST(ConvTensorLayoutsWithAuxiliary, Conv2DWithTwoAuxiliaryTensors)
|
||||
.operation = OutputOp{.elementwise_operation =
|
||||
ElementwiseOperation::SCALEADD_SCALEADD_RELU}}};
|
||||
|
||||
using TensorLayouts = ConvTensorLayouts<sig, 2>;
|
||||
using TensorLayouts = ConvTensorLayouts<sig>;
|
||||
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::GNHWC>));
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKYXC>));
|
||||
@@ -470,7 +470,7 @@ TEST(ConvTensorLayoutsWithAuxiliary, Conv1DWithBias)
|
||||
.operation =
|
||||
OutputOp{.elementwise_operation = ElementwiseOperation::SCALE}}};
|
||||
|
||||
using TensorLayouts = ConvTensorLayouts<sig, 1>;
|
||||
using TensorLayouts = ConvTensorLayouts<sig>;
|
||||
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::NWGC>));
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKXC>));
|
||||
@@ -497,7 +497,7 @@ TEST(ConvTensorLayoutsWithAuxiliary, Conv3DWithBias)
|
||||
.operation = OutputOp{.elementwise_operation =
|
||||
ElementwiseOperation::BIAS_BNORM_CLAMP}}};
|
||||
|
||||
using TensorLayouts = ConvTensorLayouts<sig, 3>;
|
||||
using TensorLayouts = ConvTensorLayouts<sig>;
|
||||
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::InLayout, ck::tensor_layout::convolution::NDHWGC>));
|
||||
EXPECT_TRUE((std::is_same_v<TensorLayouts::WeiLayout, ck::tensor_layout::convolution::GKZYXC>));
|
||||
|
||||
@@ -4,10 +4,10 @@
|
||||
#include "ck_tile/builder/conv_builder.hpp"
|
||||
#include "ck_tile/builder/types.hpp"
|
||||
#include "impl/conv_algorithm_types.hpp"
|
||||
#include "ck_tile/ref/naive_grouped_conv_fwd_gpu.hpp"
|
||||
#include "ck_tile/ref/naive_grouped_conv_bwd_data_gpu.hpp"
|
||||
#include "ck_tile/ref/naive_grouped_conv_bwd_weight_gpu.hpp"
|
||||
#include "utils/ckb_conv_test_configs.hpp"
|
||||
#include "ck/library/reference_tensor_operation/gpu/naive_conv_fwd_gpu.hpp"
|
||||
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_weight_gpu.hpp"
|
||||
#include "ck/library/reference_tensor_operation/gpu/naive_conv_bwd_data_gpu.hpp"
|
||||
#include "ck/library/utility/device_memory.hpp"
|
||||
#include "ck/library/utility/check_err.hpp"
|
||||
#include <gtest/gtest.h>
|
||||
@@ -53,29 +53,25 @@ TEST(ReferenceExecution, Forward_2D_FP16)
|
||||
// Prepare parameters for Run()
|
||||
std::vector<ck_tile::long_index_t> input_spatial{H, W};
|
||||
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
|
||||
std::vector<ck_tile::long_index_t> output_spatial{H, W};
|
||||
std::vector<ck_tile::long_index_t> strides{1, 1};
|
||||
std::vector<ck_tile::long_index_t> dilations{1, 1};
|
||||
std::vector<ck_tile::long_index_t> left_pads{1, 1};
|
||||
std::vector<ck_tile::long_index_t> right_pads{1, 1};
|
||||
|
||||
RefKernel ref_kernel;
|
||||
ref_kernel.Run(reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<ck::half_t*>(out_dev.GetDeviceBuffer()),
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
input_spatial,
|
||||
filter_spatial,
|
||||
output_spatial,
|
||||
strides,
|
||||
dilations,
|
||||
left_pads);
|
||||
|
||||
// If we get here, Run() worked!
|
||||
std::cout << "✓ Reference Forward kernel executed!" << std::endl;
|
||||
EXPECT_TRUE(true);
|
||||
EXPECT_NO_THROW(ref_kernel.Run(reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<ck::half_t*>(out_dev.GetDeviceBuffer()),
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
input_spatial,
|
||||
filter_spatial,
|
||||
strides,
|
||||
dilations,
|
||||
left_pads,
|
||||
right_pads));
|
||||
}
|
||||
|
||||
TEST(ReferenceExecution, BackwardData_2D_FP16)
|
||||
@@ -109,28 +105,26 @@ TEST(ReferenceExecution, BackwardData_2D_FP16)
|
||||
|
||||
std::vector<ck_tile::long_index_t> input_spatial{H, W};
|
||||
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
|
||||
std::vector<ck_tile::long_index_t> output_spatial{H, W};
|
||||
std::vector<ck_tile::long_index_t> strides{1, 1};
|
||||
std::vector<ck_tile::long_index_t> dilations{1, 1};
|
||||
std::vector<ck_tile::long_index_t> left_pads{1, 1};
|
||||
std::vector<ck_tile::long_index_t> right_pads{1, 1};
|
||||
|
||||
RefKernel ref_kernel;
|
||||
ref_kernel.Run(reinterpret_cast<ck::half_t*>(in_grad_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
input_spatial,
|
||||
filter_spatial,
|
||||
output_spatial,
|
||||
strides,
|
||||
dilations,
|
||||
left_pads);
|
||||
|
||||
std::cout << "✓ Reference Backward Data kernel executed!" << std::endl;
|
||||
EXPECT_TRUE(true);
|
||||
EXPECT_NO_THROW(
|
||||
ref_kernel.Run(reinterpret_cast<ck::half_t*>(in_grad_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
input_spatial,
|
||||
filter_spatial,
|
||||
strides,
|
||||
dilations,
|
||||
left_pads,
|
||||
right_pads));
|
||||
}
|
||||
|
||||
TEST(ReferenceExecution, BackwardWeight_2D_FP16)
|
||||
@@ -164,217 +158,26 @@ TEST(ReferenceExecution, BackwardWeight_2D_FP16)
|
||||
|
||||
std::vector<ck_tile::long_index_t> input_spatial{H, W};
|
||||
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
|
||||
std::vector<ck_tile::long_index_t> output_spatial{H, W};
|
||||
std::vector<ck_tile::long_index_t> strides{1, 1};
|
||||
std::vector<ck_tile::long_index_t> dilations{1, 1};
|
||||
std::vector<ck_tile::long_index_t> left_pads{1, 1};
|
||||
std::vector<ck_tile::long_index_t> right_pads{1, 1};
|
||||
|
||||
RefKernel ref_kernel;
|
||||
ref_kernel.Run(reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<ck::half_t*>(wei_grad_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
input_spatial,
|
||||
filter_spatial,
|
||||
output_spatial,
|
||||
strides,
|
||||
dilations,
|
||||
left_pads);
|
||||
|
||||
std::cout << "✓ Reference Backward Weight kernel executed!" << std::endl;
|
||||
EXPECT_TRUE(true);
|
||||
}
|
||||
|
||||
// Test the old CK interface: MakeArgumentPointer + MakeInvokerPointer
|
||||
TEST(ReferenceExecution, BackwardData_2D_FP16_InvokerInterface)
|
||||
{
|
||||
constexpr ConvSignature sig{.spatial_dim = 2,
|
||||
.direction = ConvDirection::BACKWARD_DATA,
|
||||
.data_type = DataType::FP16,
|
||||
.accumulation_data_type = DataType::FP32,
|
||||
.input = {.config = {.layout = TensorLayout::NHWGC}},
|
||||
.weight = {.config = {.layout = TensorLayout::GKYXC}},
|
||||
.output = {.config = {.layout = TensorLayout::NHWGK}}};
|
||||
|
||||
constexpr auto ref_alg = ConvAlgorithm_Reference{};
|
||||
using RefKernel = ConvBuilder<sig, ref_alg>::Instance;
|
||||
|
||||
const int G = 1, N = 2, C = 4, K = 4, H = 3, W = 3;
|
||||
|
||||
const size_t in_grad_size = G * N * C * H * W * sizeof(ck::half_t);
|
||||
const size_t wei_size = G * K * C * 3 * 3 * sizeof(ck::half_t);
|
||||
const size_t out_grad_size = G * N * K * H * W * sizeof(ck::half_t);
|
||||
|
||||
ck::DeviceMem in_grad_dev(in_grad_size);
|
||||
ck::DeviceMem wei_dev(wei_size);
|
||||
ck::DeviceMem out_grad_dev(out_grad_size);
|
||||
|
||||
in_grad_dev.SetZero();
|
||||
wei_dev.SetZero();
|
||||
out_grad_dev.SetZero();
|
||||
|
||||
std::vector<ck_tile::long_index_t> input_spatial{H, W};
|
||||
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
|
||||
std::vector<ck_tile::long_index_t> output_spatial{H, W};
|
||||
std::vector<ck_tile::long_index_t> strides{1, 1};
|
||||
std::vector<ck_tile::long_index_t> dilations{1, 1};
|
||||
std::vector<ck_tile::long_index_t> left_pads{1, 1};
|
||||
|
||||
RefKernel ref_kernel;
|
||||
|
||||
// TEST: Use the old CK interface (MakeArgumentPointer + MakeInvokerPointer)
|
||||
auto argument_ptr = ref_kernel.MakeArgumentPointer(
|
||||
reinterpret_cast<ck::half_t*>(in_grad_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
input_spatial,
|
||||
filter_spatial,
|
||||
output_spatial,
|
||||
strides,
|
||||
dilations,
|
||||
left_pads);
|
||||
|
||||
auto invoker_ptr = ref_kernel.MakeInvokerPointer();
|
||||
|
||||
// Run using invoker
|
||||
float time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
|
||||
std::cout << "✓ Reference Backward Data kernel executed via Invoker interface!" << std::endl;
|
||||
std::cout << " (time = " << time << " ms)" << std::endl;
|
||||
EXPECT_TRUE(true);
|
||||
}
|
||||
|
||||
// Test the old CK interface for Forward convolution
|
||||
TEST(ReferenceExecution, Forward_2D_FP16_InvokerInterface)
|
||||
{
|
||||
constexpr ConvSignature sig{.spatial_dim = 2,
|
||||
.direction = ConvDirection::FORWARD,
|
||||
.data_type = DataType::FP16,
|
||||
.accumulation_data_type = DataType::FP32,
|
||||
.input = {.config = {.layout = TensorLayout::GNHWC}},
|
||||
.weight = {.config = {.layout = TensorLayout::GKYXC}},
|
||||
.output = {.config = {.layout = TensorLayout::GNHWK}}};
|
||||
|
||||
constexpr auto ref_alg = ConvAlgorithm_Reference{};
|
||||
using RefKernel = ConvBuilder<sig, ref_alg>::Instance;
|
||||
|
||||
const int G = 1, N = 2, C = 4, K = 4, H = 3, W = 3;
|
||||
|
||||
const size_t in_size = G * N * C * H * W * sizeof(ck::half_t);
|
||||
const size_t wei_size = G * K * C * 3 * 3 * sizeof(ck::half_t);
|
||||
const size_t out_size = G * N * K * H * W * sizeof(ck::half_t);
|
||||
|
||||
ck::DeviceMem in_dev(in_size);
|
||||
ck::DeviceMem wei_dev(wei_size);
|
||||
ck::DeviceMem out_dev(out_size);
|
||||
|
||||
in_dev.SetZero();
|
||||
wei_dev.SetZero();
|
||||
out_dev.SetZero();
|
||||
|
||||
std::vector<ck_tile::long_index_t> input_spatial{H, W};
|
||||
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
|
||||
std::vector<ck_tile::long_index_t> output_spatial{H, W};
|
||||
std::vector<ck_tile::long_index_t> strides{1, 1};
|
||||
std::vector<ck_tile::long_index_t> dilations{1, 1};
|
||||
std::vector<ck_tile::long_index_t> left_pads{1, 1};
|
||||
|
||||
RefKernel ref_kernel;
|
||||
|
||||
// TEST: Use the old CK interface (MakeArgumentPointer + MakeInvokerPointer)
|
||||
auto argument_ptr = ref_kernel.MakeArgumentPointer(
|
||||
reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<ck::half_t*>(out_dev.GetDeviceBuffer()),
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
input_spatial,
|
||||
filter_spatial,
|
||||
output_spatial,
|
||||
strides,
|
||||
dilations,
|
||||
left_pads);
|
||||
|
||||
auto invoker_ptr = ref_kernel.MakeInvokerPointer();
|
||||
|
||||
// Run using invoker
|
||||
float time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
|
||||
std::cout << "✓ Reference Forward kernel executed via Invoker interface!" << std::endl;
|
||||
std::cout << " (time = " << time << " ms)" << std::endl;
|
||||
EXPECT_TRUE(true);
|
||||
}
|
||||
|
||||
// Test the old CK interface for Backward Weight convolution
|
||||
TEST(ReferenceExecution, BackwardWeight_2D_FP16_InvokerInterface)
|
||||
{
|
||||
constexpr ConvSignature sig{.spatial_dim = 2,
|
||||
.direction = ConvDirection::BACKWARD_WEIGHT,
|
||||
.data_type = DataType::FP16,
|
||||
.accumulation_data_type = DataType::FP32,
|
||||
.input = {.config = {.layout = TensorLayout::GNHWC}},
|
||||
.weight = {.config = {.layout = TensorLayout::GKYXC}},
|
||||
.output = {.config = {.layout = TensorLayout::GNHWK}}};
|
||||
|
||||
constexpr auto ref_alg = ConvAlgorithm_Reference{};
|
||||
using RefKernel = ConvBuilder<sig, ref_alg>::Instance;
|
||||
|
||||
const int G = 1, N = 2, C = 4, K = 4, H = 3, W = 3;
|
||||
|
||||
const size_t in_size = G * N * C * H * W * sizeof(ck::half_t);
|
||||
const size_t wei_grad_size = G * K * C * 3 * 3 * sizeof(ck::half_t);
|
||||
const size_t out_grad_size = G * N * K * H * W * sizeof(ck::half_t);
|
||||
|
||||
ck::DeviceMem in_dev(in_size);
|
||||
ck::DeviceMem wei_grad_dev(wei_grad_size);
|
||||
ck::DeviceMem out_grad_dev(out_grad_size);
|
||||
|
||||
in_dev.SetZero();
|
||||
wei_grad_dev.SetZero();
|
||||
out_grad_dev.SetZero();
|
||||
|
||||
std::vector<ck_tile::long_index_t> input_spatial{H, W};
|
||||
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
|
||||
std::vector<ck_tile::long_index_t> output_spatial{H, W};
|
||||
std::vector<ck_tile::long_index_t> strides{1, 1};
|
||||
std::vector<ck_tile::long_index_t> dilations{1, 1};
|
||||
std::vector<ck_tile::long_index_t> left_pads{1, 1};
|
||||
|
||||
RefKernel ref_kernel;
|
||||
|
||||
// TEST: Use the old CK interface (MakeArgumentPointer + MakeInvokerPointer)
|
||||
auto argument_ptr = ref_kernel.MakeArgumentPointer(
|
||||
reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<ck::half_t*>(wei_grad_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
input_spatial,
|
||||
filter_spatial,
|
||||
output_spatial,
|
||||
strides,
|
||||
dilations,
|
||||
left_pads);
|
||||
|
||||
auto invoker_ptr = ref_kernel.MakeInvokerPointer();
|
||||
|
||||
// Run using invoker
|
||||
float time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
|
||||
std::cout << "✓ Reference Backward Weight kernel executed via Invoker interface!" << std::endl;
|
||||
std::cout << " (time = " << time << " ms)" << std::endl;
|
||||
EXPECT_TRUE(true);
|
||||
EXPECT_NO_THROW(
|
||||
ref_kernel.Run(reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<ck::half_t*>(wei_grad_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
input_spatial,
|
||||
filter_spatial,
|
||||
strides,
|
||||
dilations,
|
||||
left_pads,
|
||||
right_pads));
|
||||
}
|
||||
|
||||
// Test Builder Reference vs Direct GPU Reference with RANDOM INPUT
|
||||
@@ -430,10 +233,10 @@ TEST(ReferenceExecution, Forward_2D_FP16_Builder_vs_DirectGPUReference_Random)
|
||||
|
||||
std::vector<ck_tile::long_index_t> input_spatial{H, W};
|
||||
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
|
||||
std::vector<ck_tile::long_index_t> output_spatial{H, W};
|
||||
std::vector<ck_tile::long_index_t> strides{1, 1};
|
||||
std::vector<ck_tile::long_index_t> dilations{1, 1};
|
||||
std::vector<ck_tile::long_index_t> left_pads{1, 1};
|
||||
std::vector<ck_tile::long_index_t> right_pads{1, 1};
|
||||
|
||||
RefKernel builder_kernel;
|
||||
|
||||
@@ -447,26 +250,35 @@ TEST(ReferenceExecution, Forward_2D_FP16_Builder_vs_DirectGPUReference_Random)
|
||||
C,
|
||||
input_spatial,
|
||||
filter_spatial,
|
||||
output_spatial,
|
||||
strides,
|
||||
dilations,
|
||||
left_pads);
|
||||
left_pads,
|
||||
right_pads);
|
||||
|
||||
// Run 2: Direct GPU Reference (same kernel the Builder calls internally!)
|
||||
ck_tile::naive_grouped_conv_fwd<2, ck::half_t, ck::half_t, ck::half_t>(
|
||||
ck::ref::naive_conv_fwd<ck::tensor_layout::convolution::NHWGC,
|
||||
ck::tensor_layout::convolution::GKYXC,
|
||||
ck::tensor_layout::convolution::NHWGK,
|
||||
ck::half_t,
|
||||
ck::half_t,
|
||||
ck::half_t,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough>(
|
||||
reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<ck::half_t*>(out_naive_dev.GetDeviceBuffer()),
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
input_spatial,
|
||||
filter_spatial,
|
||||
output_spatial,
|
||||
strides,
|
||||
dilations,
|
||||
left_pads);
|
||||
ck::utils::conv::ConvParam(2,
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
filter_spatial,
|
||||
input_spatial,
|
||||
strides,
|
||||
dilations,
|
||||
left_pads,
|
||||
right_pads));
|
||||
|
||||
// Copy results back
|
||||
std::vector<ck::half_t> out_builder_result(out_elements);
|
||||
@@ -475,17 +287,11 @@ TEST(ReferenceExecution, Forward_2D_FP16_Builder_vs_DirectGPUReference_Random)
|
||||
out_naive_dev.FromDevice(out_naive_result.data());
|
||||
|
||||
// Compare - should be IDENTICAL (both call same kernel)
|
||||
bool pass = ck::utils::check_err(out_builder_result,
|
||||
EXPECT_TRUE(ck::utils::check_err(out_builder_result,
|
||||
out_naive_result,
|
||||
"Error: Builder Reference != Direct GPU Reference",
|
||||
1e-6,
|
||||
1e-6); // Very tight tolerance!
|
||||
|
||||
std::cout << "✓ Builder Reference vs Direct GPU Reference (RANDOM INPUT)!" << std::endl;
|
||||
std::cout << " Result: " << (pass ? "IDENTICAL ✓" : "MISMATCH ✗") << std::endl;
|
||||
std::cout << " This validates Builder Reference Factory is correct!" << std::endl;
|
||||
|
||||
EXPECT_TRUE(pass);
|
||||
1e-6)); // Very tight tolerance!
|
||||
}
|
||||
|
||||
// Test Builder Reference vs Direct GPU Reference with RANDOM INPUT - Backward Data
|
||||
@@ -538,10 +344,10 @@ TEST(ReferenceExecution, BackwardData_2D_FP16_Builder_vs_DirectGPUReference_Rand
|
||||
|
||||
std::vector<ck_tile::long_index_t> input_spatial{H, W};
|
||||
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
|
||||
std::vector<ck_tile::long_index_t> output_spatial{H, W};
|
||||
std::vector<ck_tile::long_index_t> strides{1, 1};
|
||||
std::vector<ck_tile::long_index_t> dilations{1, 1};
|
||||
std::vector<ck_tile::long_index_t> left_pads{1, 1};
|
||||
std::vector<ck_tile::long_index_t> right_pads{1, 1};
|
||||
|
||||
RefKernel builder_kernel;
|
||||
|
||||
@@ -555,26 +361,35 @@ TEST(ReferenceExecution, BackwardData_2D_FP16_Builder_vs_DirectGPUReference_Rand
|
||||
C,
|
||||
input_spatial,
|
||||
filter_spatial,
|
||||
output_spatial,
|
||||
strides,
|
||||
dilations,
|
||||
left_pads);
|
||||
left_pads,
|
||||
right_pads);
|
||||
|
||||
// Run 2: Direct GPU Reference
|
||||
ck_tile::naive_grouped_conv_bwd_data<2, ck::half_t, ck::half_t, ck::half_t>(
|
||||
ck::ref::naive_conv_bwd_data<ck::tensor_layout::convolution::NHWGC,
|
||||
ck::tensor_layout::convolution::GKYXC,
|
||||
ck::tensor_layout::convolution::NHWGK,
|
||||
ck::half_t,
|
||||
ck::half_t,
|
||||
ck::half_t,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough>(
|
||||
reinterpret_cast<ck::half_t*>(in_grad_naive_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
input_spatial,
|
||||
filter_spatial,
|
||||
output_spatial,
|
||||
strides,
|
||||
dilations,
|
||||
left_pads);
|
||||
ck::utils::conv::ConvParam(2,
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
filter_spatial,
|
||||
input_spatial,
|
||||
strides,
|
||||
dilations,
|
||||
left_pads,
|
||||
right_pads));
|
||||
|
||||
// Compare
|
||||
std::vector<ck::half_t> in_grad_builder_result(in_grad_elements);
|
||||
@@ -582,16 +397,11 @@ TEST(ReferenceExecution, BackwardData_2D_FP16_Builder_vs_DirectGPUReference_Rand
|
||||
in_grad_builder_dev.FromDevice(in_grad_builder_result.data());
|
||||
in_grad_naive_dev.FromDevice(in_grad_naive_result.data());
|
||||
|
||||
bool pass = ck::utils::check_err(in_grad_builder_result,
|
||||
EXPECT_TRUE(ck::utils::check_err(in_grad_builder_result,
|
||||
in_grad_naive_result,
|
||||
"Error: Builder Backward Data != Direct GPU Reference",
|
||||
1e-6,
|
||||
1e-6);
|
||||
|
||||
std::cout << "✓ Builder Reference vs Direct GPU Reference (RANDOM INPUT - Backward Data)!"
|
||||
<< std::endl;
|
||||
std::cout << " Result: " << (pass ? "IDENTICAL ✓" : "MISMATCH ✗") << std::endl;
|
||||
EXPECT_TRUE(pass);
|
||||
1e-6));
|
||||
}
|
||||
|
||||
// Test Builder Reference vs Direct GPU Reference with RANDOM INPUT - Backward Weight
|
||||
@@ -644,10 +454,10 @@ TEST(ReferenceExecution, BackwardWeight_2D_FP16_Builder_vs_DirectGPUReference_Ra
|
||||
|
||||
std::vector<ck_tile::long_index_t> input_spatial{H, W};
|
||||
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
|
||||
std::vector<ck_tile::long_index_t> output_spatial{H, W};
|
||||
std::vector<ck_tile::long_index_t> strides{1, 1};
|
||||
std::vector<ck_tile::long_index_t> dilations{1, 1};
|
||||
std::vector<ck_tile::long_index_t> left_pads{1, 1};
|
||||
std::vector<ck_tile::long_index_t> right_pads{1, 1};
|
||||
|
||||
RefKernel builder_kernel;
|
||||
|
||||
@@ -661,26 +471,35 @@ TEST(ReferenceExecution, BackwardWeight_2D_FP16_Builder_vs_DirectGPUReference_Ra
|
||||
C,
|
||||
input_spatial,
|
||||
filter_spatial,
|
||||
output_spatial,
|
||||
strides,
|
||||
dilations,
|
||||
left_pads);
|
||||
left_pads,
|
||||
right_pads);
|
||||
|
||||
// Run 2: Direct GPU Reference
|
||||
ck_tile::naive_grouped_conv_bwd_weight<2, ck::half_t, ck::half_t, ck::half_t>(
|
||||
ck::ref::naive_conv_bwd_weight<ck::tensor_layout::convolution::NHWGC,
|
||||
ck::tensor_layout::convolution::GKYXC,
|
||||
ck::tensor_layout::convolution::NHWGK,
|
||||
ck::half_t,
|
||||
ck::half_t,
|
||||
ck::half_t,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough,
|
||||
ck::tensor_operation::element_wise::PassThrough>(
|
||||
reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<ck::half_t*>(wei_grad_naive_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
input_spatial,
|
||||
filter_spatial,
|
||||
output_spatial,
|
||||
strides,
|
||||
dilations,
|
||||
left_pads);
|
||||
ck::utils::conv::ConvParam(2,
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
filter_spatial,
|
||||
input_spatial,
|
||||
strides,
|
||||
dilations,
|
||||
left_pads,
|
||||
right_pads));
|
||||
|
||||
// Compare
|
||||
std::vector<ck::half_t> wei_grad_builder_result(wei_grad_elements);
|
||||
@@ -688,344 +507,11 @@ TEST(ReferenceExecution, BackwardWeight_2D_FP16_Builder_vs_DirectGPUReference_Ra
|
||||
wei_grad_builder_dev.FromDevice(wei_grad_builder_result.data());
|
||||
wei_grad_naive_dev.FromDevice(wei_grad_naive_result.data());
|
||||
|
||||
bool pass = ck::utils::check_err(wei_grad_builder_result,
|
||||
EXPECT_TRUE(ck::utils::check_err(wei_grad_builder_result,
|
||||
wei_grad_naive_result,
|
||||
"Error: Builder Backward Weight != Direct GPU Reference",
|
||||
1e-6,
|
||||
1e-6);
|
||||
|
||||
std::cout << "✓ Builder Reference vs Direct GPU Reference (RANDOM INPUT - Backward Weight)!"
|
||||
<< std::endl;
|
||||
std::cout << " Result: " << (pass ? "IDENTICAL ✓" : "MISMATCH ✗") << std::endl;
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
|
||||
// Test Invoker Interface vs Direct GPU Reference with RANDOM INPUT - Forward
|
||||
TEST(ReferenceExecution, Forward_2D_FP16_InvokerInterface_vs_DirectGPUReference_Random)
|
||||
{
|
||||
constexpr ConvSignature sig{.spatial_dim = 2,
|
||||
.direction = ConvDirection::FORWARD,
|
||||
.data_type = DataType::FP16,
|
||||
.accumulation_data_type = DataType::FP32,
|
||||
.input = {.config = {.layout = TensorLayout::NHWGC}},
|
||||
.weight = {.config = {.layout = TensorLayout::GKYXC}},
|
||||
.output = {.config = {.layout = TensorLayout::NHWGK}}};
|
||||
|
||||
constexpr auto ref_alg = ConvAlgorithm_Reference{};
|
||||
using RefKernel = ConvBuilder<sig, ref_alg>::Instance;
|
||||
|
||||
const int G = 1, N = 2, C = 16, K = 16, H = 14, W = 14;
|
||||
|
||||
const size_t in_size = G * N * C * H * W * sizeof(ck::half_t);
|
||||
const size_t wei_size = G * K * C * 3 * 3 * sizeof(ck::half_t);
|
||||
const size_t out_size = G * N * K * H * W * sizeof(ck::half_t);
|
||||
|
||||
const size_t in_elements = G * N * C * H * W;
|
||||
const size_t wei_elements = G * K * C * 3 * 3;
|
||||
const size_t out_elements = G * N * K * H * W;
|
||||
|
||||
std::vector<ck::half_t> in_host(in_elements);
|
||||
std::vector<ck::half_t> wei_host(wei_elements);
|
||||
|
||||
std::srand(12348);
|
||||
for(size_t i = 0; i < in_elements; i++)
|
||||
{
|
||||
in_host[i] = ck::half_t(static_cast<float>(std::rand()) / RAND_MAX * 2.0f - 1.0f);
|
||||
}
|
||||
for(size_t i = 0; i < wei_elements; i++)
|
||||
{
|
||||
wei_host[i] = ck::half_t(static_cast<float>(std::rand()) / RAND_MAX * 2.0f - 1.0f);
|
||||
}
|
||||
|
||||
ck::DeviceMem in_dev(in_size);
|
||||
ck::DeviceMem wei_dev(wei_size);
|
||||
ck::DeviceMem out_invoker_dev(out_size);
|
||||
ck::DeviceMem out_naive_dev(out_size);
|
||||
|
||||
in_dev.ToDevice(in_host.data());
|
||||
wei_dev.ToDevice(wei_host.data());
|
||||
out_invoker_dev.SetZero();
|
||||
out_naive_dev.SetZero();
|
||||
|
||||
std::vector<ck_tile::long_index_t> input_spatial{H, W};
|
||||
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
|
||||
std::vector<ck_tile::long_index_t> output_spatial{H, W};
|
||||
std::vector<ck_tile::long_index_t> strides{1, 1};
|
||||
std::vector<ck_tile::long_index_t> dilations{1, 1};
|
||||
std::vector<ck_tile::long_index_t> left_pads{1, 1};
|
||||
|
||||
RefKernel builder_kernel;
|
||||
|
||||
// Run 1: Builder Invoker Interface
|
||||
auto argument_ptr = builder_kernel.MakeArgumentPointer(
|
||||
reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<ck::half_t*>(out_invoker_dev.GetDeviceBuffer()),
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
input_spatial,
|
||||
filter_spatial,
|
||||
output_spatial,
|
||||
strides,
|
||||
dilations,
|
||||
left_pads);
|
||||
|
||||
auto invoker_ptr = builder_kernel.MakeInvokerPointer();
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
|
||||
// Run 2: Direct GPU Reference
|
||||
ck_tile::naive_grouped_conv_fwd<2, ck::half_t, ck::half_t, ck::half_t>(
|
||||
reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<ck::half_t*>(out_naive_dev.GetDeviceBuffer()),
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
input_spatial,
|
||||
filter_spatial,
|
||||
output_spatial,
|
||||
strides,
|
||||
dilations,
|
||||
left_pads);
|
||||
|
||||
// Compare
|
||||
std::vector<ck::half_t> out_invoker_result(out_elements);
|
||||
std::vector<ck::half_t> out_naive_result(out_elements);
|
||||
out_invoker_dev.FromDevice(out_invoker_result.data());
|
||||
out_naive_dev.FromDevice(out_naive_result.data());
|
||||
|
||||
bool pass = ck::utils::check_err(out_invoker_result,
|
||||
out_naive_result,
|
||||
"Error: Invoker Interface != Direct GPU Reference",
|
||||
1e-6,
|
||||
1e-6);
|
||||
|
||||
std::cout << "✓ Invoker Interface vs Direct GPU Reference (RANDOM - Forward)!" << std::endl;
|
||||
std::cout << " Result: " << (pass ? "IDENTICAL ✓" : "MISMATCH ✗") << std::endl;
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
|
||||
// Test Invoker Interface vs Direct GPU Reference with RANDOM INPUT - Backward Data
|
||||
TEST(ReferenceExecution, BackwardData_2D_FP16_InvokerInterface_vs_DirectGPUReference_Random)
|
||||
{
|
||||
constexpr ConvSignature sig{.spatial_dim = 2,
|
||||
.direction = ConvDirection::BACKWARD_DATA,
|
||||
.data_type = DataType::FP16,
|
||||
.accumulation_data_type = DataType::FP32,
|
||||
.input = {.config = {.layout = TensorLayout::NHWGC}},
|
||||
.weight = {.config = {.layout = TensorLayout::GKYXC}},
|
||||
.output = {.config = {.layout = TensorLayout::NHWGK}}};
|
||||
|
||||
constexpr auto ref_alg = ConvAlgorithm_Reference{};
|
||||
using RefKernel = ConvBuilder<sig, ref_alg>::Instance;
|
||||
|
||||
const int G = 1, N = 2, C = 16, K = 16, H = 14, W = 14;
|
||||
|
||||
const size_t in_grad_size = G * N * C * H * W * sizeof(ck::half_t);
|
||||
const size_t wei_size = G * K * C * 3 * 3 * sizeof(ck::half_t);
|
||||
const size_t out_grad_size = G * N * K * H * W * sizeof(ck::half_t);
|
||||
|
||||
const size_t in_grad_elements = G * N * C * H * W;
|
||||
const size_t wei_elements = G * K * C * 3 * 3;
|
||||
const size_t out_grad_elements = G * N * K * H * W;
|
||||
|
||||
std::vector<ck::half_t> wei_host(wei_elements);
|
||||
std::vector<ck::half_t> out_grad_host(out_grad_elements);
|
||||
|
||||
std::srand(12349);
|
||||
for(size_t i = 0; i < wei_elements; i++)
|
||||
{
|
||||
wei_host[i] = ck::half_t(static_cast<float>(std::rand()) / RAND_MAX * 2.0f - 1.0f);
|
||||
}
|
||||
for(size_t i = 0; i < out_grad_elements; i++)
|
||||
{
|
||||
out_grad_host[i] = ck::half_t(static_cast<float>(std::rand()) / RAND_MAX * 2.0f - 1.0f);
|
||||
}
|
||||
|
||||
ck::DeviceMem in_grad_invoker_dev(in_grad_size);
|
||||
ck::DeviceMem in_grad_naive_dev(in_grad_size);
|
||||
ck::DeviceMem wei_dev(wei_size);
|
||||
ck::DeviceMem out_grad_dev(out_grad_size);
|
||||
|
||||
wei_dev.ToDevice(wei_host.data());
|
||||
out_grad_dev.ToDevice(out_grad_host.data());
|
||||
in_grad_invoker_dev.SetZero();
|
||||
in_grad_naive_dev.SetZero();
|
||||
|
||||
std::vector<ck_tile::long_index_t> input_spatial{H, W};
|
||||
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
|
||||
std::vector<ck_tile::long_index_t> output_spatial{H, W};
|
||||
std::vector<ck_tile::long_index_t> strides{1, 1};
|
||||
std::vector<ck_tile::long_index_t> dilations{1, 1};
|
||||
std::vector<ck_tile::long_index_t> left_pads{1, 1};
|
||||
|
||||
RefKernel builder_kernel;
|
||||
|
||||
// Run 1: Builder Invoker Interface
|
||||
auto argument_ptr = builder_kernel.MakeArgumentPointer(
|
||||
reinterpret_cast<ck::half_t*>(in_grad_invoker_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
input_spatial,
|
||||
filter_spatial,
|
||||
output_spatial,
|
||||
strides,
|
||||
dilations,
|
||||
left_pads);
|
||||
|
||||
auto invoker_ptr = builder_kernel.MakeInvokerPointer();
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
|
||||
// Run 2: Direct GPU Reference
|
||||
ck_tile::naive_grouped_conv_bwd_data<2, ck::half_t, ck::half_t, ck::half_t>(
|
||||
reinterpret_cast<ck::half_t*>(in_grad_naive_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<const ck::half_t*>(wei_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
input_spatial,
|
||||
filter_spatial,
|
||||
output_spatial,
|
||||
strides,
|
||||
dilations,
|
||||
left_pads);
|
||||
|
||||
// Compare
|
||||
std::vector<ck::half_t> in_grad_invoker_result(in_grad_elements);
|
||||
std::vector<ck::half_t> in_grad_naive_result(in_grad_elements);
|
||||
in_grad_invoker_dev.FromDevice(in_grad_invoker_result.data());
|
||||
in_grad_naive_dev.FromDevice(in_grad_naive_result.data());
|
||||
|
||||
bool pass =
|
||||
ck::utils::check_err(in_grad_invoker_result,
|
||||
in_grad_naive_result,
|
||||
"Error: Invoker Interface != Direct GPU Reference (Backward Data)",
|
||||
1e-6,
|
||||
1e-6);
|
||||
|
||||
std::cout << "✓ Invoker Interface vs Direct GPU Reference (RANDOM - Backward Data)!"
|
||||
<< std::endl;
|
||||
std::cout << " Result: " << (pass ? "IDENTICAL ✓" : "MISMATCH ✗") << std::endl;
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
|
||||
// Test Invoker Interface vs Direct GPU Reference with RANDOM INPUT - Backward Weight
|
||||
TEST(ReferenceExecution, BackwardWeight_2D_FP16_InvokerInterface_vs_DirectGPUReference_Random)
|
||||
{
|
||||
constexpr ConvSignature sig{.spatial_dim = 2,
|
||||
.direction = ConvDirection::BACKWARD_WEIGHT,
|
||||
.data_type = DataType::FP16,
|
||||
.accumulation_data_type = DataType::FP32,
|
||||
.input = {.config = {.layout = TensorLayout::NHWGC}},
|
||||
.weight = {.config = {.layout = TensorLayout::GKYXC}},
|
||||
.output = {.config = {.layout = TensorLayout::NHWGK}}};
|
||||
|
||||
constexpr auto ref_alg = ConvAlgorithm_Reference{};
|
||||
using RefKernel = ConvBuilder<sig, ref_alg>::Instance;
|
||||
|
||||
const int G = 1, N = 2, C = 16, K = 16, H = 14, W = 14;
|
||||
|
||||
const size_t in_size = G * N * C * H * W * sizeof(ck::half_t);
|
||||
const size_t wei_grad_size = G * K * C * 3 * 3 * sizeof(ck::half_t);
|
||||
const size_t out_grad_size = G * N * K * H * W * sizeof(ck::half_t);
|
||||
|
||||
const size_t in_elements = G * N * C * H * W;
|
||||
const size_t wei_grad_elements = G * K * C * 3 * 3;
|
||||
const size_t out_grad_elements = G * N * K * H * W;
|
||||
|
||||
std::vector<ck::half_t> in_host(in_elements);
|
||||
std::vector<ck::half_t> out_grad_host(out_grad_elements);
|
||||
|
||||
std::srand(12350);
|
||||
for(size_t i = 0; i < in_elements; i++)
|
||||
{
|
||||
in_host[i] = ck::half_t(static_cast<float>(std::rand()) / RAND_MAX * 2.0f - 1.0f);
|
||||
}
|
||||
for(size_t i = 0; i < out_grad_elements; i++)
|
||||
{
|
||||
out_grad_host[i] = ck::half_t(static_cast<float>(std::rand()) / RAND_MAX * 2.0f - 1.0f);
|
||||
}
|
||||
|
||||
ck::DeviceMem in_dev(in_size);
|
||||
ck::DeviceMem wei_grad_invoker_dev(wei_grad_size);
|
||||
ck::DeviceMem wei_grad_naive_dev(wei_grad_size);
|
||||
ck::DeviceMem out_grad_dev(out_grad_size);
|
||||
|
||||
in_dev.ToDevice(in_host.data());
|
||||
out_grad_dev.ToDevice(out_grad_host.data());
|
||||
wei_grad_invoker_dev.SetZero();
|
||||
wei_grad_naive_dev.SetZero();
|
||||
|
||||
std::vector<ck_tile::long_index_t> input_spatial{H, W};
|
||||
std::vector<ck_tile::long_index_t> filter_spatial{3, 3};
|
||||
std::vector<ck_tile::long_index_t> output_spatial{H, W};
|
||||
std::vector<ck_tile::long_index_t> strides{1, 1};
|
||||
std::vector<ck_tile::long_index_t> dilations{1, 1};
|
||||
std::vector<ck_tile::long_index_t> left_pads{1, 1};
|
||||
|
||||
RefKernel builder_kernel;
|
||||
|
||||
// Run 1: Builder Invoker Interface
|
||||
auto argument_ptr = builder_kernel.MakeArgumentPointer(
|
||||
reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<ck::half_t*>(wei_grad_invoker_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
input_spatial,
|
||||
filter_spatial,
|
||||
output_spatial,
|
||||
strides,
|
||||
dilations,
|
||||
left_pads);
|
||||
|
||||
auto invoker_ptr = builder_kernel.MakeInvokerPointer();
|
||||
invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
|
||||
|
||||
// Run 2: Direct GPU Reference
|
||||
ck_tile::naive_grouped_conv_bwd_weight<2, ck::half_t, ck::half_t, ck::half_t>(
|
||||
reinterpret_cast<const ck::half_t*>(in_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<ck::half_t*>(wei_grad_naive_dev.GetDeviceBuffer()),
|
||||
reinterpret_cast<const ck::half_t*>(out_grad_dev.GetDeviceBuffer()),
|
||||
G,
|
||||
N,
|
||||
K,
|
||||
C,
|
||||
input_spatial,
|
||||
filter_spatial,
|
||||
output_spatial,
|
||||
strides,
|
||||
dilations,
|
||||
left_pads);
|
||||
|
||||
// Compare
|
||||
std::vector<ck::half_t> wei_grad_invoker_result(wei_grad_elements);
|
||||
std::vector<ck::half_t> wei_grad_naive_result(wei_grad_elements);
|
||||
wei_grad_invoker_dev.FromDevice(wei_grad_invoker_result.data());
|
||||
wei_grad_naive_dev.FromDevice(wei_grad_naive_result.data());
|
||||
|
||||
bool pass =
|
||||
ck::utils::check_err(wei_grad_invoker_result,
|
||||
wei_grad_naive_result,
|
||||
"Error: Invoker Interface != Direct GPU Reference (Backward Weight)",
|
||||
1e-6,
|
||||
1e-6);
|
||||
|
||||
std::cout << "✓ Invoker Interface vs Direct GPU Reference (RANDOM - Backward Weight)!"
|
||||
<< std::endl;
|
||||
std::cout << " Result: " << (pass ? "IDENTICAL ✓" : "MISMATCH ✗") << std::endl;
|
||||
EXPECT_TRUE(pass);
|
||||
1e-6));
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
@@ -62,8 +62,6 @@ TEST(ReferenceInstanceTraits, Forward_2D_FP16)
|
||||
// Verify instance_string() - now includes data type and layouts!
|
||||
std::string instance_str = Traits::instance_string();
|
||||
EXPECT_EQ(instance_str, "GPU_Reference_Forward_2D_fp16_NHWGC_GKYXC_NHWGK");
|
||||
|
||||
std::cout << "✓ Forward InstanceTraits validated: " << instance_str << std::endl;
|
||||
}
|
||||
|
||||
TEST(ReferenceInstanceTraits, BackwardData_2D_FP16)
|
||||
@@ -86,8 +84,6 @@ TEST(ReferenceInstanceTraits, BackwardData_2D_FP16)
|
||||
|
||||
std::string instance_str = Traits::instance_string();
|
||||
EXPECT_EQ(instance_str, "GPU_Reference_BackwardData_2D_fp16_NHWGC_GKYXC_NHWGK");
|
||||
|
||||
std::cout << "✓ Backward Data InstanceTraits validated: " << instance_str << std::endl;
|
||||
}
|
||||
|
||||
TEST(ReferenceInstanceTraits, BackwardWeight_2D_FP16)
|
||||
@@ -110,8 +106,6 @@ TEST(ReferenceInstanceTraits, BackwardWeight_2D_FP16)
|
||||
|
||||
std::string instance_str = Traits::instance_string();
|
||||
EXPECT_EQ(instance_str, "GPU_Reference_BackwardWeight_2D_fp16_NHWGC_GKYXC_NHWGK");
|
||||
|
||||
std::cout << "✓ Backward Weight InstanceTraits validated: " << instance_str << std::endl;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
Reference in New Issue
Block a user