mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-14 02:02:46 +00:00
Padded Generic Kernel Instance (#730)
* Add NumReduceDim template parameter to DeviceSoftmax and Softmax client API to simplify instances collecting
* Move the generic kernel instance to be the first of the instance list for elementwise op of normalization
* Add GetGenericInstance() interface for DeviceOperationInstanceFactory class of DeviceSoftmax
* Add testing of GetGenericInstance() in client_example of Softmax
* Revert "Add testing of GetGenericInstance() in client_example of Softmax" This reverts commit f629cd9a93.
* Revert "Add GetGenericInstance() interface for DeviceOperationInstanceFactory class of DeviceSoftmax" This reverts commit a9f0d000eb.
* Support generic kernel instance to be the first instance returned by GetInstances() for GroupNorm
* Move generic kernel instance to separate tuple for elementwise op of normalization
* Remove un-used files for softmax instance
* Store generic kernel instance to separate tuple for softmax
* Add IsSupported checking for generic instance to client example of softmax
* Replace the get_device_normalize_from_mean_meansquare_instances() by the DeviceOperationInstanceFactory class for elementwise-normalization
* clang-format fix
* Remove int8 from softmax instances
---------
Co-authored-by: zjing14 <zhangjing14@gmail.com>
[ROCm/composable_kernel commit:0d9118226b]
This commit is contained in:
@@ -172,18 +172,19 @@ int main()
|
||||
BLayout,
|
||||
CLayout>();
|
||||
|
||||
const auto normalize_ptrs =
|
||||
ck::tensor_operation::device::instance::get_device_normalize_from_mean_meansquare_instances<
|
||||
CDataType,
|
||||
ReduceDataType,
|
||||
ReduceDataType,
|
||||
GammaDataType,
|
||||
BetaDataType,
|
||||
LayerNormOutDataType>();
|
||||
|
||||
std::cout << "found " << gemm_reduce_ptrs.size()
|
||||
<< " gemm_reduceMean_reduceSquareMean instances" << std::endl;
|
||||
|
||||
using NormalizeDeviceOp = ck::tensor_operation::device::DeviceElementwise<
|
||||
ck::Tuple<CDataType, ReduceDataType, ReduceDataType, GammaDataType, BetaDataType>,
|
||||
ck::Tuple<LayerNormOutDataType>,
|
||||
ck::tensor_operation::element_wise::Normalize,
|
||||
2>;
|
||||
|
||||
const auto normalize_ptrs =
|
||||
ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
NormalizeDeviceOp>::GetInstances();
|
||||
|
||||
std::cout << "found " << normalize_ptrs.size() << " normalize instances" << std::endl;
|
||||
|
||||
auto f_matrix_space_size =
|
||||
|
||||
@@ -53,12 +53,35 @@ int main(int argc, char* argv[])
|
||||
SimpleDeviceMem in(sizeof(InDataType) * num_elements);
|
||||
SimpleDeviceMem out(sizeof(OutDataType) * num_elements);
|
||||
|
||||
using DeviceOp = ck::tensor_operation::device::
|
||||
DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
|
||||
using DeviceOp = ck::tensor_operation::device::DeviceSoftmax<InDataType,
|
||||
AccDataType,
|
||||
OutDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Rank,
|
||||
NumReduceDim>;
|
||||
// get device op instances
|
||||
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
DeviceOp>::GetInstances();
|
||||
|
||||
auto& generic_op_ptr = op_ptrs[0];
|
||||
|
||||
auto generic_argument_ptr = generic_op_ptr->MakeArgumentPointer(in_lengths,
|
||||
in_strides,
|
||||
reduce_dims,
|
||||
alpha,
|
||||
beta,
|
||||
in.GetDeviceBuffer(),
|
||||
out.GetDeviceBuffer(),
|
||||
PassThrough{},
|
||||
PassThrough{});
|
||||
|
||||
if(!generic_op_ptr->IsSupportedArgument(generic_argument_ptr.get()))
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"The generic kernel instance should be able to support any input shapes");
|
||||
};
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
std::string best_op_name;
|
||||
@@ -74,11 +97,6 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
auto& op_ptr = op_ptrs[i];
|
||||
|
||||
if(op_ptr->GetRank() != Rank || op_ptr->GetNumReduceDim() != NumReduceDim)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
auto argument_ptr = op_ptr->MakeArgumentPointer(in_lengths,
|
||||
in_strides,
|
||||
reduce_dims,
|
||||
|
||||
@@ -72,6 +72,30 @@ int main(int argc, char* argv[])
|
||||
|
||||
std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
|
||||
|
||||
const auto& generic_op_ptr = op_ptrs[0];
|
||||
|
||||
auto generic_argument_ptr =
|
||||
generic_op_ptr->MakeArgumentPointer({N, H, W, G, C}, // lengths
|
||||
xy_strides, // xStrides
|
||||
gamma_beta_strides, // gammaStrides
|
||||
gamma_beta_strides, // betaStrides
|
||||
xy_strides, // yStrides
|
||||
{1, 2, 4}, // reduceDims
|
||||
1e-6,
|
||||
x_device_buf.GetDeviceBuffer(),
|
||||
gamma_device_buf.GetDeviceBuffer(),
|
||||
beta_device_buf.GetDeviceBuffer(),
|
||||
y_device_buf.GetDeviceBuffer(),
|
||||
nullptr,
|
||||
nullptr,
|
||||
Swish{});
|
||||
|
||||
if(!generic_op_ptr->IsSupportedArgument(generic_argument_ptr.get()))
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"The generic kernel instance should be able to support any input shapes");
|
||||
};
|
||||
|
||||
std::string best_op_name;
|
||||
bool found = false;
|
||||
int best_op_id = -1;
|
||||
|
||||
@@ -18,7 +18,8 @@ template <typename InDataType,
|
||||
typename OutDataType,
|
||||
typename InElementwiseOp,
|
||||
typename AccElementwiseOp,
|
||||
index_t Rank>
|
||||
index_t Rank,
|
||||
index_t NumReduceDim>
|
||||
struct DeviceSoftmax : public BaseOperator
|
||||
{
|
||||
//
|
||||
@@ -49,8 +50,6 @@ struct DeviceSoftmax : public BaseOperator
|
||||
AccElementwiseOp acc_elementwise_op) = 0;
|
||||
|
||||
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
|
||||
virtual index_t GetRank() const = 0;
|
||||
virtual index_t GetNumReduceDim() const = 0;
|
||||
};
|
||||
|
||||
template <typename InDataType,
|
||||
@@ -58,9 +57,15 @@ template <typename InDataType,
|
||||
typename OutDataType,
|
||||
typename InElementwiseOp,
|
||||
typename AccElementwiseOp,
|
||||
index_t Rank>
|
||||
using DeviceSoftmaxPtr = std::unique_ptr<
|
||||
DeviceSoftmax<InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank>>;
|
||||
index_t Rank,
|
||||
index_t NumReduceDim>
|
||||
using DeviceSoftmaxPtr = std::unique_ptr<DeviceSoftmax<InDataType,
|
||||
AccDataType,
|
||||
OutDataType,
|
||||
InElementwiseOp,
|
||||
AccElementwiseOp,
|
||||
Rank,
|
||||
NumReduceDim>>;
|
||||
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
@@ -38,16 +38,9 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
|
||||
OutDataType,
|
||||
InElementwiseOp,
|
||||
AccElementwiseOp,
|
||||
Rank>
|
||||
Rank,
|
||||
NumReduceDim>
|
||||
{
|
||||
static constexpr index_t kRank = Rank;
|
||||
static constexpr index_t kNumReduceDim = NumReduceDim;
|
||||
static constexpr index_t kNumInvariantDim = Rank - NumReduceDim;
|
||||
|
||||
virtual index_t GetRank() const override { return kRank; }
|
||||
|
||||
virtual index_t GetNumReduceDim() const override { return kNumReduceDim; }
|
||||
|
||||
static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
|
||||
|
||||
static constexpr index_t NumSrcDim = Rank;
|
||||
@@ -287,13 +280,13 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
|
||||
{
|
||||
if constexpr(InSrcVectorDim == 0)
|
||||
{
|
||||
if constexpr(kNumInvariantDim == 0)
|
||||
if constexpr(NumInvariantDim == 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{
|
||||
if(arg.inStrides_[kNumInvariantDim - 1] != 1 && InSrcVectorSize != 1)
|
||||
if(arg.inStrides_[NumInvariantDim - 1] != 1 && InSrcVectorSize != 1)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
@@ -316,7 +309,7 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
|
||||
}
|
||||
|
||||
// To improve
|
||||
if(kNumInvariantDim > 0 && arg.invariant_lowest_length_ % OutDstVectorSize != 0)
|
||||
if(NumInvariantDim > 0 && arg.invariant_lowest_length_ % OutDstVectorSize != 0)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -5,11 +5,10 @@
|
||||
|
||||
#include <vector>
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
@@ -29,20 +28,34 @@ template <typename InputType,
|
||||
typename GammaDataType,
|
||||
typename BetaDataType,
|
||||
typename OutputType>
|
||||
auto get_device_normalize_from_mean_meansquare_instances()
|
||||
struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceElementwise<
|
||||
ck::Tuple<InputType, MeanType, MeanSquareType, GammaDataType, BetaDataType>,
|
||||
ck::Tuple<OutputType>,
|
||||
Normalize,
|
||||
2>>
|
||||
{
|
||||
std::vector<DeviceNormalizeFromMeanMeanSquarePtr> op_ptrs;
|
||||
using DeviceOp = DeviceElementwise<
|
||||
ck::Tuple<InputType, MeanType, MeanSquareType, GammaDataType, BetaDataType>,
|
||||
ck::Tuple<OutputType>,
|
||||
Normalize,
|
||||
2>;
|
||||
|
||||
if constexpr(is_same<InputType, half_t>::value && is_same<MeanType, float>::value &&
|
||||
is_same<MeanSquareType, float>::value && is_same<GammaDataType, half_t>::value &&
|
||||
is_same<BetaDataType, half_t>::value && is_same<OutputType, half_t>::value)
|
||||
static auto GetInstances()
|
||||
{
|
||||
ck::tensor_operation::device::instance::
|
||||
add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(op_ptrs);
|
||||
}
|
||||
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
|
||||
|
||||
return op_ptrs;
|
||||
}
|
||||
if constexpr(is_same<InputType, half_t>::value && is_same<MeanType, float>::value &&
|
||||
is_same<MeanSquareType, float>::value &&
|
||||
is_same<GammaDataType, half_t>::value &&
|
||||
is_same<BetaDataType, half_t>::value && is_same<OutputType, half_t>::value)
|
||||
{
|
||||
ck::tensor_operation::device::instance::
|
||||
add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(op_ptrs);
|
||||
}
|
||||
|
||||
return op_ptrs;
|
||||
};
|
||||
};
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -9,34 +9,33 @@
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f16_f16_rank3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>&);
|
||||
void add_device_softmax_f16_f16_rank4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>&);
|
||||
|
||||
void add_device_softmax_f32_f32_rank3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>&);
|
||||
void add_device_softmax_f32_f32_rank4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>&);
|
||||
|
||||
void add_device_softmax_i8_i8_rank3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>&);
|
||||
void add_device_softmax_i8_i8_rank4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>&);
|
||||
|
||||
template <typename InDataType, typename AccDataType, typename OutDataType, index_t Rank>
|
||||
struct DeviceOperationInstanceFactory<
|
||||
ck::tensor_operation::device::
|
||||
DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>>
|
||||
template <typename InDataType,
|
||||
typename AccDataType,
|
||||
typename OutDataType,
|
||||
index_t Rank,
|
||||
index_t NumReduceDim>
|
||||
struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceSoftmax<InDataType,
|
||||
AccDataType,
|
||||
OutDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Rank,
|
||||
NumReduceDim>>
|
||||
{
|
||||
using DeviceOp =
|
||||
DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
|
||||
using DeviceOp = DeviceSoftmax<InDataType,
|
||||
AccDataType,
|
||||
OutDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Rank,
|
||||
NumReduceDim>;
|
||||
|
||||
static auto GetInstances()
|
||||
{
|
||||
@@ -46,25 +45,49 @@ struct DeviceOperationInstanceFactory<
|
||||
std::is_same_v<OutDataType, F16>)
|
||||
{
|
||||
if constexpr(Rank == 3)
|
||||
add_device_softmax_f16_f16_rank3_instances(op_ptrs);
|
||||
{
|
||||
if constexpr(NumReduceDim == 1)
|
||||
add_device_softmax_f16_f16_rank3_reduce1_instances(op_ptrs);
|
||||
else if constexpr(NumReduceDim == 2)
|
||||
add_device_softmax_f16_f16_rank3_reduce2_instances(op_ptrs);
|
||||
else if constexpr(NumReduceDim == 3)
|
||||
add_device_softmax_f16_f16_rank3_reduce3_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(Rank == 4)
|
||||
add_device_softmax_f16_f16_rank4_instances(op_ptrs);
|
||||
{
|
||||
if constexpr(NumReduceDim == 1)
|
||||
add_device_softmax_f16_f16_rank4_reduce1_instances(op_ptrs);
|
||||
else if constexpr(NumReduceDim == 2)
|
||||
add_device_softmax_f16_f16_rank4_reduce2_instances(op_ptrs);
|
||||
else if constexpr(NumReduceDim == 3)
|
||||
add_device_softmax_f16_f16_rank4_reduce3_instances(op_ptrs);
|
||||
else if constexpr(NumReduceDim == 4)
|
||||
add_device_softmax_f16_f16_rank4_reduce4_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
else if constexpr(std::is_same_v<InDataType, F32> && std::is_same_v<AccDataType, F32> &&
|
||||
std::is_same_v<OutDataType, F32>)
|
||||
{
|
||||
if constexpr(Rank == 3)
|
||||
add_device_softmax_f32_f32_rank3_instances(op_ptrs);
|
||||
{
|
||||
if constexpr(NumReduceDim == 1)
|
||||
add_device_softmax_f32_f32_rank3_reduce1_instances(op_ptrs);
|
||||
else if constexpr(NumReduceDim == 2)
|
||||
add_device_softmax_f32_f32_rank3_reduce2_instances(op_ptrs);
|
||||
else if constexpr(NumReduceDim == 3)
|
||||
add_device_softmax_f32_f32_rank3_reduce3_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(Rank == 4)
|
||||
add_device_softmax_f32_f32_rank4_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(std::is_same_v<InDataType, I8> && std::is_same_v<AccDataType, F32> &&
|
||||
std::is_same_v<OutDataType, I8>)
|
||||
{
|
||||
if constexpr(Rank == 3)
|
||||
add_device_softmax_i8_i8_rank3_instances(op_ptrs);
|
||||
else if constexpr(Rank == 4)
|
||||
add_device_softmax_i8_i8_rank4_instances(op_ptrs);
|
||||
{
|
||||
if constexpr(NumReduceDim == 1)
|
||||
add_device_softmax_f32_f32_rank4_reduce1_instances(op_ptrs);
|
||||
else if constexpr(NumReduceDim == 2)
|
||||
add_device_softmax_f32_f32_rank4_reduce2_instances(op_ptrs);
|
||||
else if constexpr(NumReduceDim == 3)
|
||||
add_device_softmax_f32_f32_rank4_reduce3_instances(op_ptrs);
|
||||
else if constexpr(NumReduceDim == 4)
|
||||
add_device_softmax_f32_f32_rank4_reduce4_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
|
||||
return op_ptrs;
|
||||
|
||||
@@ -1,22 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f16_f16_rank3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
|
||||
void add_device_softmax_f16_f16_rank4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -14,7 +14,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f16_f16_rank3_reduce1_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 1>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -14,7 +14,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f16_f16_rank3_reduce2_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 2>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -14,7 +14,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f16_f16_rank3_reduce3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 3>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -14,7 +14,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f16_f16_rank4_reduce1_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 1>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -14,7 +14,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f16_f16_rank4_reduce2_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 2>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -14,7 +14,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f16_f16_rank4_reduce3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 3>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -14,7 +14,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f16_f16_rank4_reduce4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 4>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -16,7 +16,6 @@ template <index_t Rank, index_t Reduce>
|
||||
using device_softmax_f16_f16_instances = std::tuple<
|
||||
// clang-format off
|
||||
// InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
|
||||
// fallback kernel
|
||||
DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>,
|
||||
DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 8, 1, 8, 8>,
|
||||
DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 4, 64, 1, 8, 1, 8, 8>,
|
||||
@@ -33,6 +32,13 @@ using device_softmax_f16_f16_instances = std::tuple<
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
// Single-entry tuple holding the F16-in / F32-accumulate / F16-out softmax
// "generic" kernel instance, kept apart from device_softmax_f16_f16_instances.
// Using the parameter order listed in the header comment of that tuple, the
// trailing arguments are: BlockSize=64, MThreadClusterSize=8,
// KThreadClusterSize=8, MThreadSliceSize=1, KThreadSliceSize=1,
// InSrcVectorDim=1, InSrcVectorSize=1, OutDstVectorSize=1.
// NOTE(review): the unit vector sizes are presumably what let this instance
// accept arbitrary input shapes -- confirm against
// DeviceSoftmaxImpl::IsSupportedArgument.
template <index_t Rank, index_t Reduce>
|
||||
using device_softmax_f16_f16_generic_instance = std::tuple<
|
||||
// clang-format off
|
||||
DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 64, 8, 8, 1, 1, 1, 1, 1>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
@@ -1,22 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f32_f32_rank3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);
|
||||
void add_device_softmax_f32_f32_rank4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -14,7 +14,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f32_f32_rank3_reduce1_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 1>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -14,7 +14,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f32_f32_rank3_reduce2_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 2>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -14,7 +14,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f32_f32_rank3_reduce3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 3>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -14,7 +14,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f32_f32_rank4_reduce1_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 1>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -14,7 +14,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f32_f32_rank4_reduce2_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 2>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -14,7 +14,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f32_f32_rank4_reduce3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 3>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -14,7 +14,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f32_f32_rank4_reduce4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 4>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -16,7 +16,7 @@ template <index_t Rank, index_t Reduce>
|
||||
using device_softmax_f32_f32_instances = std::tuple<
|
||||
// clang-format off
|
||||
// InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
|
||||
DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, // fallback kernel
|
||||
DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>,
|
||||
DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 4>,
|
||||
DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 4, 64, 1, 8, 1, 4, 4>,
|
||||
DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 8, 1, 4, 4>,
|
||||
@@ -32,6 +32,13 @@ using device_softmax_f32_f32_instances = std::tuple<
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
// Single-entry tuple holding the all-F32 softmax "generic" kernel instance,
// kept apart from device_softmax_f32_f32_instances.
// Using the parameter order listed in the header comment of that tuple, the
// trailing arguments are: BlockSize=64, MThreadClusterSize=8,
// KThreadClusterSize=8, MThreadSliceSize=1, KThreadSliceSize=1,
// InSrcVectorDim=1, InSrcVectorSize=1, OutDstVectorSize=1.
// NOTE(review): the unit vector sizes are presumably what let this instance
// accept arbitrary input shapes -- confirm against
// DeviceSoftmaxImpl::IsSupportedArgument.
template <index_t Rank, index_t Reduce>
|
||||
using device_softmax_f32_f32_generic_instance = std::tuple<
|
||||
// clang-format off
|
||||
DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 64, 8, 8, 1, 1, 1, 1, 1>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
@@ -1,22 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_i8_i8_rank3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
|
||||
void add_device_softmax_i8_i8_rank4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,22 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_i8_i8_rank3_reduce1_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,22 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_i8_i8_rank3_reduce2_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,22 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_i8_i8_rank3_reduce3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,22 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_i8_i8_rank4_reduce1_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,22 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_i8_i8_rank4_reduce2_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,22 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_i8_i8_rank4_reduce3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,22 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_i8_i8_rank4_reduce4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,40 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <tuple>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// Tuple of DeviceSoftmaxImpl configurations with I8 input/output, F32 accumulation
// and PassThrough for both element-wise operations.
// Rank and Reduce are forwarded as the Rank and NumReduceDim template arguments;
// the remaining integers follow the column legend inside the tuple
// (BlockSize, thread-cluster sizes, thread-slice sizes, vector dim and sizes).
// The per-rank/per-reduce registration functions instantiate it, e.g.
// device_softmax_i8_i8_instances<RANK, 1>{}, and pass the value to
// add_device_operation_instances.
template <index_t Rank, index_t Reduce>
|
||||
using device_softmax_i8_i8_instances = std::tuple<
|
||||
// clang-format off
|
||||
// InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
|
||||
// fallback kernel
|
||||
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 16, 1, 1, 1>,
|
||||
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 16, 1, 16, 16>,
|
||||
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 4, 64, 1, 16, 1, 16, 16>,
|
||||
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 16, 1, 16, 16>,
|
||||
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 32, 1, 16, 16>,
|
||||
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 64, 1, 16, 16>,
|
||||
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 16, 1, 16, 16>,
|
||||
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 32, 1, 16, 16>,
|
||||
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 64, 1, 16, 16>,
|
||||
// Reduction on middle dimensions
|
||||
// InSrcVectorDim is 0 since we want to coalesce reads on M dimension
|
||||
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 8, 8, 0, 1, 1>,
|
||||
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 32, 8, 32, 8, 0, 16, 8>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -3,6 +3,17 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp"
|
||||
|
||||
@@ -30,7 +30,12 @@ using device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances = std:
|
||||
//###################|<in, mean, square_mean, gamma, beta>| <out>| functor| NDim| MPerThread| <in, mean, square_mean, gamma, beta ScalarPerVector>| <out ScalarPerVector>|
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 8, Sequence<8, 1, 1, 8, 8>, Sequence<8> >,
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 4, Sequence<4, 1, 1, 4, 4>, Sequence<4> >,
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 2, Sequence<2, 1, 1, 2, 2>, Sequence<2> >,
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 2, Sequence<2, 1, 1, 2, 2>, Sequence<2> >
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
// One-element tuple holding the generic Normalize element-wise instance for
// Tuple<F16, F32, F32, F16, F16> inputs and a Tuple<F16> output.
// Following the column legend of the tuned instance list (functor | NDim |
// MPerThread | input ScalarPerVector | output ScalarPerVector), this instance uses
// NDim = 2, MPerThread = 1 and a ScalarPerVector of 1 for every input and the output.
// add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances registers
// this tuple before the tuned instance tuple.
// NOTE(review): the unit vector widths presumably make it usable for sizes the
// vectorized instances reject — confirm against DeviceElementwiseImpl.
using device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_generic_instance = std::tuple<
|
||||
// clang-format off
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >
|
||||
// clang-format on
|
||||
>;
|
||||
@@ -39,6 +44,9 @@ void add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(
|
||||
std::vector<DeviceElementwisePtr<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2>>&
|
||||
instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances, device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_generic_instance{});
|
||||
|
||||
add_device_operation_instances(
|
||||
instances, device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances{});
|
||||
}
|
||||
|
||||
@@ -14,6 +14,8 @@ void add_device_normalization_rank_5_3_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 5, 3>>>&
|
||||
instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_normalization_f16_generic_instance<Pass, 5, 3>{});
|
||||
add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 5, 3>{});
|
||||
}
|
||||
|
||||
|
||||
@@ -14,6 +14,8 @@ void add_device_normalization_rank_5_3_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 5, 3>>>&
|
||||
instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_normalization_f32_generic_instance<Pass, 5, 3>{});
|
||||
add_device_operation_instances(instances, device_normalization_f32_instances<Pass, 5, 3>{});
|
||||
}
|
||||
|
||||
|
||||
@@ -14,6 +14,8 @@ void add_device_normalization_rank_5_3_swish_f16_f32_f32_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceNormalization<F16, F32, F32, F32, F16, Swish, 5, 3>>>&
|
||||
instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances, device_normalization_f16_f32_f32_f16_generic_instance<Swish, 5, 3>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_normalization_f16_f32_f32_f16_instances<Swish, 5, 3>{});
|
||||
}
|
||||
|
||||
@@ -14,6 +14,8 @@ void add_device_normalization_rank_5_3_swish_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Swish, 5, 3>>>&
|
||||
instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_normalization_f16_generic_instance<Swish, 5, 3>{});
|
||||
add_device_operation_instances(instances, device_normalization_f16_instances<Swish, 5, 3>{});
|
||||
}
|
||||
|
||||
|
||||
@@ -14,6 +14,8 @@ void add_device_normalization_rank_5_3_swish_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Swish, 5, 3>>>&
|
||||
instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_normalization_f32_generic_instance<Swish, 5, 3>{});
|
||||
add_device_operation_instances(instances, device_normalization_f32_instances<Swish, 5, 3>{});
|
||||
}
|
||||
|
||||
|
||||
@@ -14,6 +14,8 @@ void add_device_normalization_rank_2_1_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 2, 1>>>&
|
||||
instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_normalization_f16_generic_instance<Pass, 2, 1>{});
|
||||
add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 2, 1>{});
|
||||
}
|
||||
|
||||
|
||||
@@ -14,6 +14,8 @@ void add_device_normalization_rank_2_1_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 2, 1>>>&
|
||||
instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_normalization_f32_generic_instance<Pass, 2, 1>{});
|
||||
add_device_operation_instances(instances, device_normalization_f32_instances<Pass, 2, 1>{});
|
||||
}
|
||||
|
||||
|
||||
@@ -14,6 +14,8 @@ void add_device_normalization_rank_4_3_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 4, 3>>>&
|
||||
instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_normalization_f16_generic_instance<Pass, 4, 3>{});
|
||||
add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 4, 3>{});
|
||||
}
|
||||
|
||||
|
||||
@@ -14,6 +14,8 @@ void add_device_normalization_rank_4_3_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 4, 3>>>&
|
||||
instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_normalization_f32_generic_instance<Pass, 4, 3>{});
|
||||
add_device_operation_instances(instances, device_normalization_f32_instances<Pass, 4, 3>{});
|
||||
}
|
||||
|
||||
|
||||
@@ -43,6 +43,13 @@ using device_normalization_f16_instances =
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
// One-element tuple holding a single DeviceNormalizationImpl configuration with
// data types <F16, F16, F16, F32, F16>, parameterized on the output element-wise
// operation, Rank and Reduce.
// The add_device_normalization_*_f16_instances functions register this tuple
// before device_normalization_f16_instances.
// The trailing integer arguments are 64, 1, 64 followed by nine 1s; their meaning
// is defined by DeviceNormalizationImpl's parameter list, which is not visible here.
// NOTE(review): presumably block/cluster sizes followed by unit slice and vector
// sizes so the instance accepts any shape — confirm against DeviceNormalizationImpl.
template <typename OutElementwise, index_t Rank, index_t Reduce>
|
||||
using device_normalization_f16_generic_instance = std::tuple<
|
||||
// clang-format off
|
||||
DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 64, 1, 64, 1, 1, 1, 1, 1, 1, 1, 1, 1>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
template <typename OutElementwise, index_t Rank, index_t Reduce>
|
||||
using device_normalization_f32_instances = std::tuple<
|
||||
// clang-format off
|
||||
@@ -69,6 +76,13 @@ using device_normalization_f32_instances = std::tuple<
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
// One-element tuple holding a single DeviceNormalizationImpl configuration with
// data types <F32, F32, F32, F32, F32>, parameterized on the output element-wise
// operation, Rank and Reduce.
// The add_device_normalization_*_f32_instances functions register this tuple
// before device_normalization_f32_instances.
// The trailing integer arguments are 64, 1, 64 followed by nine 1s; their meaning
// is defined by DeviceNormalizationImpl's parameter list, which is not visible here.
// NOTE(review): presumably block/cluster sizes followed by unit slice and vector
// sizes so the instance accepts any shape — confirm against DeviceNormalizationImpl.
template <typename OutElementwise, index_t Rank, index_t Reduce>
|
||||
using device_normalization_f32_generic_instance = std::tuple<
|
||||
// clang-format off
|
||||
DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 64, 1, 64, 1, 1, 1, 1, 1, 1, 1, 1, 1>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
template <typename OutElementwise, index_t Rank, index_t Reduce>
|
||||
using device_normalization_f16_f32_f32_f16_instances = std::tuple<
|
||||
// clang-format off
|
||||
@@ -95,6 +109,13 @@ using device_normalization_f16_f32_f32_f16_instances = std::tuple<
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
// One-element tuple holding a single DeviceNormalizationImpl configuration with
// data types <F16, F32, F32, F32, F16>, parameterized on the output element-wise
// operation, Rank and Reduce.
// add_device_normalization_rank_5_3_swish_f16_f32_f32_f16_instances registers this
// tuple before device_normalization_f16_f32_f32_f16_instances.
// The trailing integer arguments are 64, 1, 64 followed by nine 1s; their meaning
// is defined by DeviceNormalizationImpl's parameter list, which is not visible here.
// NOTE(review): presumably block/cluster sizes followed by unit slice and vector
// sizes so the instance accepts any shape — confirm against DeviceNormalizationImpl.
template <typename OutElementwise, index_t Rank, index_t Reduce>
|
||||
using device_normalization_f16_f32_f32_f16_generic_instance = std::tuple<
|
||||
// clang-format off
|
||||
DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce, 64, 1, 64, 1, 1, 1, 1, 1, 1, 1, 1, 1>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
@@ -1,13 +1,4 @@
|
||||
add_instance_library(device_softmax_instance
|
||||
device_softmax_i8_i8_instance.cpp
|
||||
device_softmax_i8_i8_instance_rank3_reduce1.cpp
|
||||
device_softmax_i8_i8_instance_rank3_reduce2.cpp
|
||||
device_softmax_i8_i8_instance_rank3_reduce3.cpp
|
||||
device_softmax_i8_i8_instance_rank4_reduce1.cpp
|
||||
device_softmax_i8_i8_instance_rank4_reduce2.cpp
|
||||
device_softmax_i8_i8_instance_rank4_reduce3.cpp
|
||||
device_softmax_i8_i8_instance_rank4_reduce4.cpp
|
||||
device_softmax_f16_f16_instance.cpp
|
||||
device_softmax_f16_f16_instance_rank3_reduce1.cpp
|
||||
device_softmax_f16_f16_instance_rank3_reduce2.cpp
|
||||
device_softmax_f16_f16_instance_rank3_reduce3.cpp
|
||||
@@ -15,7 +6,6 @@ add_instance_library(device_softmax_instance
|
||||
device_softmax_f16_f16_instance_rank4_reduce2.cpp
|
||||
device_softmax_f16_f16_instance_rank4_reduce3.cpp
|
||||
device_softmax_f16_f16_instance_rank4_reduce4.cpp
|
||||
device_softmax_f32_f32_instance.cpp
|
||||
device_softmax_f32_f32_instance_rank3_reduce1.cpp
|
||||
device_softmax_f32_f32_instance_rank3_reduce2.cpp
|
||||
device_softmax_f32_f32_instance_rank3_reduce3.cpp
|
||||
|
||||
@@ -1,40 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// Aggregator for the rank-3 F16 softmax instance lists.
// Forwards the same `instances` vector to the reduce1, reduce2 and reduce3
// registration functions, in that order.
// This function adds nothing to `instances` itself.
void add_device_softmax_f16_f16_rank3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances)
|
||||
{
|
||||
add_device_softmax_f16_f16_rank3_reduce1_instances(instances);
|
||||
add_device_softmax_f16_f16_rank3_reduce2_instances(instances);
|
||||
add_device_softmax_f16_f16_rank3_reduce3_instances(instances);
|
||||
}
|
||||
|
||||
// Aggregator for the rank-4 F16 softmax instance lists.
// Forwards the same `instances` vector to the reduce1 through reduce4
// registration functions, in that order.
// This function adds nothing to `instances` itself.
void add_device_softmax_f16_f16_rank4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances)
|
||||
{
|
||||
add_device_softmax_f16_f16_rank4_reduce1_instances(instances);
|
||||
add_device_softmax_f16_f16_rank4_reduce2_instances(instances);
|
||||
add_device_softmax_f16_f16_rank4_reduce3_instances(instances);
|
||||
add_device_softmax_f16_f16_rank4_reduce4_instances(instances);
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -13,12 +13,11 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 3;
|
||||
|
||||
void add_device_softmax_f16_f16_rank3_reduce1_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 1>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 1>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<3, 1>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 1>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
|
||||
@@ -13,12 +13,11 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 3;
|
||||
|
||||
void add_device_softmax_f16_f16_rank3_reduce2_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 2>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 2>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<3, 2>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 2>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
|
||||
@@ -13,12 +13,11 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 3;
|
||||
|
||||
void add_device_softmax_f16_f16_rank3_reduce3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 3>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 3>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<3, 3>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 3>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
|
||||
@@ -13,12 +13,11 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 4;
|
||||
|
||||
void add_device_softmax_f16_f16_rank4_reduce1_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 1>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 1>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<4, 1>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 1>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
|
||||
@@ -13,12 +13,11 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 4;
|
||||
|
||||
void add_device_softmax_f16_f16_rank4_reduce2_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 2>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 2>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<4, 2>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 2>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
|
||||
@@ -13,12 +13,11 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 4;
|
||||
|
||||
void add_device_softmax_f16_f16_rank4_reduce3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 3>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 3>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<4, 3>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 3>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
|
||||
@@ -13,12 +13,11 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 4;
|
||||
|
||||
void add_device_softmax_f16_f16_rank4_reduce4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 4>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 4>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<4, 4>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 4>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
|
||||
@@ -1,40 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// Aggregator for the rank-3 F32 softmax instance lists.
// Forwards the same `instances` vector to the reduce1, reduce2 and reduce3
// registration functions, in that order.
// This function adds nothing to `instances` itself.
void add_device_softmax_f32_f32_rank3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances)
|
||||
{
|
||||
add_device_softmax_f32_f32_rank3_reduce1_instances(instances);
|
||||
add_device_softmax_f32_f32_rank3_reduce2_instances(instances);
|
||||
add_device_softmax_f32_f32_rank3_reduce3_instances(instances);
|
||||
}
|
||||
|
||||
// Aggregator for the rank-4 F32 softmax instance lists.
// Forwards the same `instances` vector to the reduce1 through reduce4
// registration functions, in that order.
// This function adds nothing to `instances` itself.
void add_device_softmax_f32_f32_rank4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances)
|
||||
{
|
||||
add_device_softmax_f32_f32_rank4_reduce1_instances(instances);
|
||||
add_device_softmax_f32_f32_rank4_reduce2_instances(instances);
|
||||
add_device_softmax_f32_f32_rank4_reduce3_instances(instances);
|
||||
add_device_softmax_f32_f32_rank4_reduce4_instances(instances);
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -13,12 +13,11 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 3;
|
||||
|
||||
void add_device_softmax_f32_f32_rank3_reduce1_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 1>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 1>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<3, 1>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 1>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
|
||||
@@ -13,12 +13,11 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 3;
|
||||
|
||||
void add_device_softmax_f32_f32_rank3_reduce2_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 2>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 2>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<3, 2>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 2>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
|
||||
@@ -13,12 +13,11 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 3;
|
||||
|
||||
void add_device_softmax_f32_f32_rank3_reduce3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 3>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 3>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<3, 3>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 3>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
|
||||
@@ -13,12 +13,11 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 4;
|
||||
|
||||
void add_device_softmax_f32_f32_rank4_reduce1_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 1>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 1>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<4, 1>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 1>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
|
||||
@@ -13,12 +13,11 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 4;
|
||||
|
||||
void add_device_softmax_f32_f32_rank4_reduce2_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 2>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 2>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<4, 2>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 2>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
|
||||
@@ -13,12 +13,11 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 4;
|
||||
|
||||
void add_device_softmax_f32_f32_rank4_reduce3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 3>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 3>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<4, 3>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 3>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
|
||||
@@ -13,12 +13,11 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 4;
|
||||
|
||||
void add_device_softmax_f32_f32_rank4_reduce4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 4>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 4>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<4, 4>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 4>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
|
||||
@@ -1,40 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// Aggregator for the rank-3 I8 softmax instance lists.
// Forwards the same `instances` vector to the reduce1, reduce2 and reduce3
// registration functions, in that order.
// This function adds nothing to `instances` itself.
void add_device_softmax_i8_i8_rank3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances)
|
||||
{
|
||||
add_device_softmax_i8_i8_rank3_reduce1_instances(instances);
|
||||
add_device_softmax_i8_i8_rank3_reduce2_instances(instances);
|
||||
add_device_softmax_i8_i8_rank3_reduce3_instances(instances);
|
||||
}
|
||||
|
||||
// Aggregator for the rank-4 I8 softmax instance lists.
// Forwards the same `instances` vector to the reduce1 through reduce4
// registration functions, in that order.
// This function adds nothing to `instances` itself.
void add_device_softmax_i8_i8_rank4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances)
|
||||
{
|
||||
add_device_softmax_i8_i8_rank4_reduce1_instances(instances);
|
||||
add_device_softmax_i8_i8_rank4_reduce2_instances(instances);
|
||||
add_device_softmax_i8_i8_rank4_reduce3_instances(instances);
|
||||
add_device_softmax_i8_i8_rank4_reduce4_instances(instances);
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,27 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
// Tensor rank shared by the signature and the instance tuple below.
// `static` gives it internal linkage in this translation unit.
static constexpr index_t RANK = 3;
|
||||
|
||||
// Registers the I8 softmax configurations for rank RANK with 1 reduced dimension.
// It passes a default-constructed device_softmax_i8_i8_instances<RANK, 1> tuple,
// together with `instances`, to add_device_operation_instances.
// NOTE(review): that helper is not shown here; it is expected to append one
// instance per tuple element — confirm.
void add_device_softmax_i8_i8_rank3_reduce1_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 1>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,27 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 3;
|
||||
|
||||
void add_device_softmax_i8_i8_rank3_reduce2_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 2>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,27 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 3;
|
||||
|
||||
void add_device_softmax_i8_i8_rank3_reduce3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 3>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,27 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 4;
|
||||
|
||||
void add_device_softmax_i8_i8_rank4_reduce1_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 1>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,27 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 4;
|
||||
|
||||
void add_device_softmax_i8_i8_rank4_reduce2_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 2>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,27 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 4;
|
||||
|
||||
void add_device_softmax_i8_i8_rank4_reduce3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 3>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,27 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 4;
|
||||
|
||||
void add_device_softmax_i8_i8_rank4_reduce4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 4>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -40,7 +40,11 @@ template <> std::string type_to_string<int8_t>() { return "int8"; }
|
||||
template <> std::string type_to_string<int32_t>() { return "int32"; }
|
||||
// clang-format on
|
||||
|
||||
template <typename InDataType, typename AccDataType, typename OutDataType, index_t Rank>
|
||||
template <typename InDataType,
|
||||
typename AccDataType,
|
||||
typename OutDataType,
|
||||
index_t Rank,
|
||||
index_t NumReduceDim>
|
||||
bool profile_softmax_impl(int do_verification,
|
||||
int init_method,
|
||||
bool do_log,
|
||||
@@ -54,7 +58,13 @@ bool profile_softmax_impl(int do_verification,
|
||||
if(Rank != in_length.size())
|
||||
{
|
||||
throw std::runtime_error("Input tensor rank is different from template argument Rank!");
|
||||
}
|
||||
};
|
||||
|
||||
if(NumReduceDim != reduce_dims.size())
|
||||
{
|
||||
throw std::runtime_error(
|
||||
"Input reduce_dims rank is different from template argument NumReduceDim!");
|
||||
};
|
||||
|
||||
Tensor<InDataType> in = in_strides.empty() ? Tensor<InDataType>(in_length)
|
||||
: Tensor<InDataType>(in_length, in_strides);
|
||||
@@ -92,8 +102,13 @@ bool profile_softmax_impl(int do_verification,
|
||||
|
||||
// add device softmax instances
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using DeviceOp = tensor_operation::device::
|
||||
DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
|
||||
using DeviceOp = tensor_operation::device::DeviceSoftmax<InDataType,
|
||||
AccDataType,
|
||||
OutDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Rank,
|
||||
NumReduceDim>;
|
||||
|
||||
// get device op instances
|
||||
const auto instances = tensor_operation::device::instance::DeviceOperationInstanceFactory<
|
||||
@@ -112,13 +127,6 @@ bool profile_softmax_impl(int do_verification,
|
||||
|
||||
for(auto& inst_ptr : instances)
|
||||
{
|
||||
// Is it the user's responsibility to check whether the problem mismatches the kernel instance
|
||||
// (i.e. a rank 3 problem given to a rank 4 kernel) by means other than invoking IsSupportedArgument()?
|
||||
if(!(inst_ptr->GetNumReduceDim() == static_cast<index_t>(reduce_dims.size())))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
auto argument_ptr = inst_ptr->MakeArgumentPointer(in_tensor_lengths,
|
||||
in_tensor_strides,
|
||||
reduce_dims,
|
||||
|
||||
@@ -92,27 +92,76 @@ int profile_softmax(int argc, char* argv[])
|
||||
{
|
||||
if(data_type == SoftmaxDataType::F16_F16)
|
||||
{
|
||||
ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3>(do_verification,
|
||||
init_method,
|
||||
do_log,
|
||||
time_kernel,
|
||||
length,
|
||||
stride,
|
||||
reduce,
|
||||
double(alpha),
|
||||
double(beta));
|
||||
if(reduce.size() == 1)
|
||||
ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3, 1>(
|
||||
do_verification,
|
||||
init_method,
|
||||
do_log,
|
||||
time_kernel,
|
||||
length,
|
||||
stride,
|
||||
reduce,
|
||||
double(alpha),
|
||||
double(beta));
|
||||
else if(reduce.size() == 2)
|
||||
ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3, 2>(
|
||||
do_verification,
|
||||
init_method,
|
||||
do_log,
|
||||
time_kernel,
|
||||
length,
|
||||
stride,
|
||||
reduce,
|
||||
double(alpha),
|
||||
double(beta));
|
||||
else if(reduce.size() == 3)
|
||||
ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3, 3>(
|
||||
do_verification,
|
||||
init_method,
|
||||
do_log,
|
||||
time_kernel,
|
||||
length,
|
||||
stride,
|
||||
reduce,
|
||||
double(alpha),
|
||||
double(beta));
|
||||
else
|
||||
throw std::runtime_error("invalid number of dimensions to reduce");
|
||||
}
|
||||
else if(data_type == SoftmaxDataType::F32_F32)
|
||||
{
|
||||
ck::profiler::profile_softmax_impl<float, float, float, 3>(do_verification,
|
||||
init_method,
|
||||
do_log,
|
||||
time_kernel,
|
||||
length,
|
||||
stride,
|
||||
reduce,
|
||||
double(alpha),
|
||||
double(beta));
|
||||
if(reduce.size() == 1)
|
||||
ck::profiler::profile_softmax_impl<float, float, float, 3, 1>(do_verification,
|
||||
init_method,
|
||||
do_log,
|
||||
time_kernel,
|
||||
length,
|
||||
stride,
|
||||
reduce,
|
||||
double(alpha),
|
||||
double(beta));
|
||||
else if(reduce.size() == 2)
|
||||
ck::profiler::profile_softmax_impl<float, float, float, 3, 2>(do_verification,
|
||||
init_method,
|
||||
do_log,
|
||||
time_kernel,
|
||||
length,
|
||||
stride,
|
||||
reduce,
|
||||
double(alpha),
|
||||
double(beta));
|
||||
else if(reduce.size() == 3)
|
||||
ck::profiler::profile_softmax_impl<float, float, float, 3, 3>(do_verification,
|
||||
init_method,
|
||||
do_log,
|
||||
time_kernel,
|
||||
length,
|
||||
stride,
|
||||
reduce,
|
||||
double(alpha),
|
||||
double(beta));
|
||||
else
|
||||
throw std::runtime_error("invalid number of dimensions to reduce");
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -124,27 +173,97 @@ int profile_softmax(int argc, char* argv[])
|
||||
{
|
||||
if(data_type == SoftmaxDataType::F16_F16)
|
||||
{
|
||||
ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4>(do_verification,
|
||||
init_method,
|
||||
do_log,
|
||||
time_kernel,
|
||||
length,
|
||||
stride,
|
||||
reduce,
|
||||
double(alpha),
|
||||
double(beta));
|
||||
if(reduce.size() == 1)
|
||||
ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4, 1>(
|
||||
do_verification,
|
||||
init_method,
|
||||
do_log,
|
||||
time_kernel,
|
||||
length,
|
||||
stride,
|
||||
reduce,
|
||||
double(alpha),
|
||||
double(beta));
|
||||
else if(reduce.size() == 2)
|
||||
ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4, 2>(
|
||||
do_verification,
|
||||
init_method,
|
||||
do_log,
|
||||
time_kernel,
|
||||
length,
|
||||
stride,
|
||||
reduce,
|
||||
double(alpha),
|
||||
double(beta));
|
||||
else if(reduce.size() == 3)
|
||||
ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4, 3>(
|
||||
do_verification,
|
||||
init_method,
|
||||
do_log,
|
||||
time_kernel,
|
||||
length,
|
||||
stride,
|
||||
reduce,
|
||||
double(alpha),
|
||||
double(beta));
|
||||
else if(reduce.size() == 4)
|
||||
ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4, 4>(
|
||||
do_verification,
|
||||
init_method,
|
||||
do_log,
|
||||
time_kernel,
|
||||
length,
|
||||
stride,
|
||||
reduce,
|
||||
double(alpha),
|
||||
double(beta));
|
||||
else
|
||||
throw std::runtime_error("invalid number of dimensions to reduce");
|
||||
}
|
||||
else if(data_type == SoftmaxDataType::F32_F32)
|
||||
{
|
||||
ck::profiler::profile_softmax_impl<float, float, float, 4>(do_verification,
|
||||
init_method,
|
||||
do_log,
|
||||
time_kernel,
|
||||
length,
|
||||
stride,
|
||||
reduce,
|
||||
double(alpha),
|
||||
double(beta));
|
||||
if(reduce.size() == 1)
|
||||
ck::profiler::profile_softmax_impl<float, float, float, 4, 1>(do_verification,
|
||||
init_method,
|
||||
do_log,
|
||||
time_kernel,
|
||||
length,
|
||||
stride,
|
||||
reduce,
|
||||
double(alpha),
|
||||
double(beta));
|
||||
else if(reduce.size() == 2)
|
||||
ck::profiler::profile_softmax_impl<float, float, float, 4, 2>(do_verification,
|
||||
init_method,
|
||||
do_log,
|
||||
time_kernel,
|
||||
length,
|
||||
stride,
|
||||
reduce,
|
||||
double(alpha),
|
||||
double(beta));
|
||||
else if(reduce.size() == 3)
|
||||
ck::profiler::profile_softmax_impl<float, float, float, 4, 3>(do_verification,
|
||||
init_method,
|
||||
do_log,
|
||||
time_kernel,
|
||||
length,
|
||||
stride,
|
||||
reduce,
|
||||
double(alpha),
|
||||
double(beta));
|
||||
else if(reduce.size() == 4)
|
||||
ck::profiler::profile_softmax_impl<float, float, float, 4, 4>(do_verification,
|
||||
init_method,
|
||||
do_log,
|
||||
time_kernel,
|
||||
length,
|
||||
stride,
|
||||
reduce,
|
||||
double(alpha),
|
||||
double(beta));
|
||||
else
|
||||
throw std::runtime_error("invalid number of dimensions to reduce");
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
@@ -13,7 +13,6 @@ using I = ck::Number<N>;
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
using I8 = int8_t;
|
||||
|
||||
template <typename Tuple>
|
||||
class TestSoftmax : public ck::TestSoftmax<Tuple>
|
||||
@@ -24,8 +23,7 @@ class TestSoftmax : public ck::TestSoftmax<Tuple>
|
||||
using KernelTypes = ::testing::Types<
|
||||
// InDataType, AccDataType, OutDataType, Rank
|
||||
std::tuple< F16, F32, F16, I<3>>,
|
||||
std::tuple< F32, F32, F32, I<3>>,
|
||||
std::tuple< I8, F32, I8, I<3>>
|
||||
std::tuple< F32, F32, F32, I<3>>
|
||||
>;
|
||||
// clang-format on
|
||||
|
||||
|
||||
@@ -13,7 +13,6 @@ using I = ck::Number<N>;
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
using I8 = int8_t;
|
||||
|
||||
template <typename Tuple>
|
||||
class TestSoftmax : public ck::TestSoftmax<Tuple>
|
||||
@@ -24,8 +23,7 @@ class TestSoftmax : public ck::TestSoftmax<Tuple>
|
||||
using KernelTypes = ::testing::Types<
|
||||
// InDataType, AccDataType, OutDataType, Rank
|
||||
std::tuple< F16, F32, F16, I<4>>,
|
||||
std::tuple< F32, F32, F32, I<4>>,
|
||||
std::tuple< I8, F32, I8, I<4>>
|
||||
std::tuple< F32, F32, F32, I<4>>
|
||||
>;
|
||||
// clang-format on
|
||||
|
||||
|
||||
@@ -61,8 +61,92 @@ class TestSoftmax : public ::testing::Test
|
||||
int init_method = 1; // integer value initialization
|
||||
bool log = false;
|
||||
std::vector<ck::index_t> strides; // intentionally empty, to get packed layout.
|
||||
bool pass = ck::profiler::profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank>(
|
||||
verify_, init_method, log, bench_, in_length, strides, reduce_dims, alpha, beta);
|
||||
bool pass = false;
|
||||
|
||||
if constexpr(Rank == 3)
|
||||
{
|
||||
if(reduce_dims.size() == 1)
|
||||
pass = ck::profiler::
|
||||
profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 1>(verify_,
|
||||
init_method,
|
||||
log,
|
||||
bench_,
|
||||
in_length,
|
||||
strides,
|
||||
reduce_dims,
|
||||
alpha,
|
||||
beta);
|
||||
else if(reduce_dims.size() == 2)
|
||||
pass = ck::profiler::
|
||||
profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 2>(verify_,
|
||||
init_method,
|
||||
log,
|
||||
bench_,
|
||||
in_length,
|
||||
strides,
|
||||
reduce_dims,
|
||||
alpha,
|
||||
beta);
|
||||
else if(reduce_dims.size() == 3)
|
||||
pass = ck::profiler::
|
||||
profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 3>(verify_,
|
||||
init_method,
|
||||
log,
|
||||
bench_,
|
||||
in_length,
|
||||
strides,
|
||||
reduce_dims,
|
||||
alpha,
|
||||
beta);
|
||||
}
|
||||
else if constexpr(Rank == 4)
|
||||
{
|
||||
if(reduce_dims.size() == 1)
|
||||
pass = ck::profiler::
|
||||
profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 1>(verify_,
|
||||
init_method,
|
||||
log,
|
||||
bench_,
|
||||
in_length,
|
||||
strides,
|
||||
reduce_dims,
|
||||
alpha,
|
||||
beta);
|
||||
else if(reduce_dims.size() == 2)
|
||||
pass = ck::profiler::
|
||||
profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 2>(verify_,
|
||||
init_method,
|
||||
log,
|
||||
bench_,
|
||||
in_length,
|
||||
strides,
|
||||
reduce_dims,
|
||||
alpha,
|
||||
beta);
|
||||
else if(reduce_dims.size() == 3)
|
||||
pass = ck::profiler::
|
||||
profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 3>(verify_,
|
||||
init_method,
|
||||
log,
|
||||
bench_,
|
||||
in_length,
|
||||
strides,
|
||||
reduce_dims,
|
||||
alpha,
|
||||
beta);
|
||||
else if(reduce_dims.size() == 4)
|
||||
pass = ck::profiler::
|
||||
profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 4>(verify_,
|
||||
init_method,
|
||||
log,
|
||||
bench_,
|
||||
in_length,
|
||||
strides,
|
||||
reduce_dims,
|
||||
alpha,
|
||||
beta);
|
||||
};
|
||||
|
||||
EXPECT_TRUE(pass);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user