Add NumReduceDim template parameter to DeviceSoftmax and Softmax client API to simplify instances collecting

This commit is contained in:
Qianfeng Zhang
2023-05-30 21:17:54 +00:00
parent ac9e01e2cc
commit 240900b1c0
57 changed files with 441 additions and 398 deletions

View File

@@ -53,8 +53,13 @@ int main(int argc, char* argv[])
SimpleDeviceMem in(sizeof(InDataType) * num_elements);
SimpleDeviceMem out(sizeof(OutDataType) * num_elements);
using DeviceOp = ck::tensor_operation::device::
DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
using DeviceOp = ck::tensor_operation::device::DeviceSoftmax<InDataType,
AccDataType,
OutDataType,
PassThrough,
PassThrough,
Rank,
NumReduceDim>;
// get device op instances
const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
DeviceOp>::GetInstances();
@@ -74,11 +79,6 @@ int main(int argc, char* argv[])
{
auto& op_ptr = op_ptrs[i];
if(op_ptr->GetRank() != Rank || op_ptr->GetNumReduceDim() != NumReduceDim)
{
continue;
}
auto argument_ptr = op_ptr->MakeArgumentPointer(in_lengths,
in_strides,
reduce_dims,

View File

@@ -18,7 +18,8 @@ template <typename InDataType,
typename OutDataType,
typename InElementwiseOp,
typename AccElementwiseOp,
index_t Rank>
index_t Rank,
index_t NumReduceDim>
struct DeviceSoftmax : public BaseOperator
{
//
@@ -49,8 +50,6 @@ struct DeviceSoftmax : public BaseOperator
AccElementwiseOp acc_elementwise_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
virtual index_t GetRank() const = 0;
virtual index_t GetNumReduceDim() const = 0;
};
template <typename InDataType,
@@ -58,9 +57,15 @@ template <typename InDataType,
typename OutDataType,
typename InElementwiseOp,
typename AccElementwiseOp,
index_t Rank>
using DeviceSoftmaxPtr = std::unique_ptr<
DeviceSoftmax<InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank>>;
index_t Rank,
index_t NumReduceDim>
using DeviceSoftmaxPtr = std::unique_ptr<DeviceSoftmax<InDataType,
AccDataType,
OutDataType,
InElementwiseOp,
AccElementwiseOp,
Rank,
NumReduceDim>>;
} // namespace device
} // namespace tensor_operation

View File

@@ -38,16 +38,9 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
OutDataType,
InElementwiseOp,
AccElementwiseOp,
Rank>
Rank,
NumReduceDim>
{
static constexpr index_t kRank = Rank;
static constexpr index_t kNumReduceDim = NumReduceDim;
static constexpr index_t kNumInvariantDim = Rank - NumReduceDim;
virtual index_t GetRank() const override { return kRank; }
virtual index_t GetNumReduceDim() const override { return kNumReduceDim; }
static constexpr index_t NumInvariantDim = Rank - NumReduceDim;
static constexpr index_t NumSrcDim = Rank;
@@ -287,13 +280,13 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
{
if constexpr(InSrcVectorDim == 0)
{
if constexpr(kNumInvariantDim == 0)
if constexpr(NumInvariantDim == 0)
{
return false;
}
else
{
if(arg.inStrides_[kNumInvariantDim - 1] != 1 && InSrcVectorSize != 1)
if(arg.inStrides_[NumInvariantDim - 1] != 1 && InSrcVectorSize != 1)
{
return false;
}
@@ -316,7 +309,7 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax<InDataType,
}
// To improve
if(kNumInvariantDim > 0 && arg.invariant_lowest_length_ % OutDstVectorSize != 0)
if(NumInvariantDim > 0 && arg.invariant_lowest_length_ % OutDstVectorSize != 0)
{
return false;
}

View File

@@ -9,34 +9,33 @@
#include "ck/ck.hpp"
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_softmax_f16_f16_rank3_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>&);
void add_device_softmax_f16_f16_rank4_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>&);
void add_device_softmax_f32_f32_rank3_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>&);
void add_device_softmax_f32_f32_rank4_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>&);
void add_device_softmax_i8_i8_rank3_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>&);
void add_device_softmax_i8_i8_rank4_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>&);
template <typename InDataType, typename AccDataType, typename OutDataType, index_t Rank>
struct DeviceOperationInstanceFactory<
ck::tensor_operation::device::
DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>>
template <typename InDataType,
typename AccDataType,
typename OutDataType,
index_t Rank,
index_t NumReduceDim>
struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceSoftmax<InDataType,
AccDataType,
OutDataType,
PassThrough,
PassThrough,
Rank,
NumReduceDim>>
{
using DeviceOp =
DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
using DeviceOp = DeviceSoftmax<InDataType,
AccDataType,
OutDataType,
PassThrough,
PassThrough,
Rank,
NumReduceDim>;
static auto GetInstances()
{
@@ -46,25 +45,73 @@ struct DeviceOperationInstanceFactory<
std::is_same_v<OutDataType, F16>)
{
if constexpr(Rank == 3)
add_device_softmax_f16_f16_rank3_instances(op_ptrs);
{
if constexpr(NumReduceDim == 1)
add_device_softmax_f16_f16_rank3_reduce1_instances(op_ptrs);
else if constexpr(NumReduceDim == 2)
add_device_softmax_f16_f16_rank3_reduce2_instances(op_ptrs);
else if constexpr(NumReduceDim == 3)
add_device_softmax_f16_f16_rank3_reduce3_instances(op_ptrs);
}
else if constexpr(Rank == 4)
add_device_softmax_f16_f16_rank4_instances(op_ptrs);
{
if constexpr(NumReduceDim == 1)
add_device_softmax_f16_f16_rank4_reduce1_instances(op_ptrs);
else if constexpr(NumReduceDim == 2)
add_device_softmax_f16_f16_rank4_reduce2_instances(op_ptrs);
else if constexpr(NumReduceDim == 3)
add_device_softmax_f16_f16_rank4_reduce3_instances(op_ptrs);
else if constexpr(NumReduceDim == 4)
add_device_softmax_f16_f16_rank4_reduce4_instances(op_ptrs);
}
}
else if constexpr(std::is_same_v<InDataType, F32> && std::is_same_v<AccDataType, F32> &&
std::is_same_v<OutDataType, F32>)
{
if constexpr(Rank == 3)
add_device_softmax_f32_f32_rank3_instances(op_ptrs);
{
if constexpr(NumReduceDim == 1)
add_device_softmax_f32_f32_rank3_reduce1_instances(op_ptrs);
else if constexpr(NumReduceDim == 2)
add_device_softmax_f32_f32_rank3_reduce2_instances(op_ptrs);
else if constexpr(NumReduceDim == 3)
add_device_softmax_f32_f32_rank3_reduce3_instances(op_ptrs);
}
else if constexpr(Rank == 4)
add_device_softmax_f32_f32_rank4_instances(op_ptrs);
{
if constexpr(NumReduceDim == 1)
add_device_softmax_f32_f32_rank4_reduce1_instances(op_ptrs);
else if constexpr(NumReduceDim == 2)
add_device_softmax_f32_f32_rank4_reduce2_instances(op_ptrs);
else if constexpr(NumReduceDim == 3)
add_device_softmax_f32_f32_rank4_reduce3_instances(op_ptrs);
else if constexpr(NumReduceDim == 4)
add_device_softmax_f32_f32_rank4_reduce4_instances(op_ptrs);
}
}
else if constexpr(std::is_same_v<InDataType, I8> && std::is_same_v<AccDataType, F32> &&
std::is_same_v<OutDataType, I8>)
{
if constexpr(Rank == 3)
add_device_softmax_i8_i8_rank3_instances(op_ptrs);
{
if constexpr(NumReduceDim == 1)
add_device_softmax_i8_i8_rank3_reduce1_instances(op_ptrs);
else if constexpr(NumReduceDim == 2)
add_device_softmax_i8_i8_rank3_reduce2_instances(op_ptrs);
else if constexpr(NumReduceDim == 3)
add_device_softmax_i8_i8_rank3_reduce3_instances(op_ptrs);
}
else if constexpr(Rank == 4)
add_device_softmax_i8_i8_rank4_instances(op_ptrs);
{
if constexpr(NumReduceDim == 1)
add_device_softmax_i8_i8_rank4_reduce1_instances(op_ptrs);
else if constexpr(NumReduceDim == 2)
add_device_softmax_i8_i8_rank4_reduce2_instances(op_ptrs);
else if constexpr(NumReduceDim == 3)
add_device_softmax_i8_i8_rank4_reduce3_instances(op_ptrs);
else if constexpr(NumReduceDim == 4)
add_device_softmax_i8_i8_rank4_reduce4_instances(op_ptrs);
}
}
return op_ptrs;

View File

@@ -1,22 +0,0 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_softmax_f16_f16_rank3_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
void add_device_softmax_f16_f16_rank4_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -14,7 +14,7 @@ namespace device {
namespace instance {
void add_device_softmax_f16_f16_rank3_reduce1_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 1>>& instances);
} // namespace instance
} // namespace device

View File

@@ -14,7 +14,7 @@ namespace device {
namespace instance {
void add_device_softmax_f16_f16_rank3_reduce2_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 2>>& instances);
} // namespace instance
} // namespace device

View File

@@ -14,7 +14,7 @@ namespace device {
namespace instance {
void add_device_softmax_f16_f16_rank3_reduce3_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 3>>& instances);
} // namespace instance
} // namespace device

View File

@@ -14,7 +14,7 @@ namespace device {
namespace instance {
void add_device_softmax_f16_f16_rank4_reduce1_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 1>>& instances);
} // namespace instance
} // namespace device

View File

@@ -14,7 +14,7 @@ namespace device {
namespace instance {
void add_device_softmax_f16_f16_rank4_reduce2_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 2>>& instances);
} // namespace instance
} // namespace device

View File

@@ -14,7 +14,7 @@ namespace device {
namespace instance {
void add_device_softmax_f16_f16_rank4_reduce3_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 3>>& instances);
} // namespace instance
} // namespace device

View File

@@ -14,7 +14,7 @@ namespace device {
namespace instance {
void add_device_softmax_f16_f16_rank4_reduce4_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 4>>& instances);
} // namespace instance
} // namespace device

View File

@@ -1,22 +0,0 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_softmax_f32_f32_rank3_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);
void add_device_softmax_f32_f32_rank4_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -14,7 +14,7 @@ namespace device {
namespace instance {
void add_device_softmax_f32_f32_rank3_reduce1_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 1>>& instances);
} // namespace instance
} // namespace device

View File

@@ -14,7 +14,7 @@ namespace device {
namespace instance {
void add_device_softmax_f32_f32_rank3_reduce2_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 2>>& instances);
} // namespace instance
} // namespace device

View File

@@ -14,7 +14,7 @@ namespace device {
namespace instance {
void add_device_softmax_f32_f32_rank3_reduce3_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 3>>& instances);
} // namespace instance
} // namespace device

View File

@@ -14,7 +14,7 @@ namespace device {
namespace instance {
void add_device_softmax_f32_f32_rank4_reduce1_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 1>>& instances);
} // namespace instance
} // namespace device

View File

@@ -14,7 +14,7 @@ namespace device {
namespace instance {
void add_device_softmax_f32_f32_rank4_reduce2_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 2>>& instances);
} // namespace instance
} // namespace device

View File

@@ -14,7 +14,7 @@ namespace device {
namespace instance {
void add_device_softmax_f32_f32_rank4_reduce3_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 3>>& instances);
} // namespace instance
} // namespace device

View File

@@ -14,7 +14,7 @@ namespace device {
namespace instance {
void add_device_softmax_f32_f32_rank4_reduce4_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 4>>& instances);
} // namespace instance
} // namespace device

View File

@@ -1,22 +0,0 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_softmax_i8_i8_rank3_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
void add_device_softmax_i8_i8_rank4_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -14,7 +14,7 @@ namespace device {
namespace instance {
void add_device_softmax_i8_i8_rank3_reduce1_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances);
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 1>>& instances);
} // namespace instance
} // namespace device

View File

@@ -14,7 +14,7 @@ namespace device {
namespace instance {
void add_device_softmax_i8_i8_rank3_reduce2_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances);
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 2>>& instances);
} // namespace instance
} // namespace device

View File

@@ -14,7 +14,7 @@ namespace device {
namespace instance {
void add_device_softmax_i8_i8_rank3_reduce3_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances);
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 3>>& instances);
} // namespace instance
} // namespace device

View File

@@ -14,7 +14,7 @@ namespace device {
namespace instance {
void add_device_softmax_i8_i8_rank4_reduce1_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 1>>& instances);
} // namespace instance
} // namespace device

View File

@@ -14,7 +14,7 @@ namespace device {
namespace instance {
void add_device_softmax_i8_i8_rank4_reduce2_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 2>>& instances);
} // namespace instance
} // namespace device

View File

@@ -14,7 +14,7 @@ namespace device {
namespace instance {
void add_device_softmax_i8_i8_rank4_reduce3_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 3>>& instances);
} // namespace instance
} // namespace device

View File

@@ -14,7 +14,7 @@ namespace device {
namespace instance {
void add_device_softmax_i8_i8_rank4_reduce4_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 4>>& instances);
} // namespace instance
} // namespace device

View File

@@ -3,6 +3,24 @@
#pragma once
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp"

View File

@@ -1,5 +1,4 @@
add_instance_library(device_softmax_instance
device_softmax_i8_i8_instance.cpp
device_softmax_i8_i8_instance_rank3_reduce1.cpp
device_softmax_i8_i8_instance_rank3_reduce2.cpp
device_softmax_i8_i8_instance_rank3_reduce3.cpp
@@ -7,7 +6,6 @@ add_instance_library(device_softmax_instance
device_softmax_i8_i8_instance_rank4_reduce2.cpp
device_softmax_i8_i8_instance_rank4_reduce3.cpp
device_softmax_i8_i8_instance_rank4_reduce4.cpp
device_softmax_f16_f16_instance.cpp
device_softmax_f16_f16_instance_rank3_reduce1.cpp
device_softmax_f16_f16_instance_rank3_reduce2.cpp
device_softmax_f16_f16_instance_rank3_reduce3.cpp
@@ -15,7 +13,6 @@ add_instance_library(device_softmax_instance
device_softmax_f16_f16_instance_rank4_reduce2.cpp
device_softmax_f16_f16_instance_rank4_reduce3.cpp
device_softmax_f16_f16_instance_rank4_reduce4.cpp
device_softmax_f32_f32_instance.cpp
device_softmax_f32_f32_instance_rank3_reduce1.cpp
device_softmax_f32_f32_instance_rank3_reduce2.cpp
device_softmax_f32_f32_instance_rank3_reduce3.cpp

View File

@@ -1,40 +0,0 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_softmax_f16_f16_rank3_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances)
{
add_device_softmax_f16_f16_rank3_reduce1_instances(instances);
add_device_softmax_f16_f16_rank3_reduce2_instances(instances);
add_device_softmax_f16_f16_rank3_reduce3_instances(instances);
}
void add_device_softmax_f16_f16_rank4_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances)
{
add_device_softmax_f16_f16_rank4_reduce1_instances(instances);
add_device_softmax_f16_f16_rank4_reduce2_instances(instances);
add_device_softmax_f16_f16_rank4_reduce3_instances(instances);
add_device_softmax_f16_f16_rank4_reduce4_instances(instances);
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -13,12 +13,10 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 3;
void add_device_softmax_f16_f16_rank3_reduce1_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 1>>& instances)
{
add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 1>{});
add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 1>{});
}
} // namespace instance

View File

@@ -13,12 +13,10 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 3;
void add_device_softmax_f16_f16_rank3_reduce2_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 2>>& instances)
{
add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 2>{});
add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 2>{});
}
} // namespace instance

View File

@@ -13,12 +13,10 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 3;
void add_device_softmax_f16_f16_rank3_reduce3_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 3>>& instances)
{
add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 3>{});
add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 3>{});
}
} // namespace instance

View File

@@ -13,12 +13,10 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 4;
void add_device_softmax_f16_f16_rank4_reduce1_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 1>>& instances)
{
add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 1>{});
add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 1>{});
}
} // namespace instance

View File

@@ -13,12 +13,10 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 4;
void add_device_softmax_f16_f16_rank4_reduce2_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 2>>& instances)
{
add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 2>{});
add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 2>{});
}
} // namespace instance

View File

@@ -13,12 +13,10 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 4;
void add_device_softmax_f16_f16_rank4_reduce3_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 3>>& instances)
{
add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 3>{});
add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 3>{});
}
} // namespace instance

View File

@@ -13,12 +13,10 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 4;
void add_device_softmax_f16_f16_rank4_reduce4_instances(
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 4>>& instances)
{
add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 4>{});
add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 4>{});
}
} // namespace instance

View File

@@ -1,40 +0,0 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_softmax_f32_f32_rank3_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances)
{
add_device_softmax_f32_f32_rank3_reduce1_instances(instances);
add_device_softmax_f32_f32_rank3_reduce2_instances(instances);
add_device_softmax_f32_f32_rank3_reduce3_instances(instances);
}
void add_device_softmax_f32_f32_rank4_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances)
{
add_device_softmax_f32_f32_rank4_reduce1_instances(instances);
add_device_softmax_f32_f32_rank4_reduce2_instances(instances);
add_device_softmax_f32_f32_rank4_reduce3_instances(instances);
add_device_softmax_f32_f32_rank4_reduce4_instances(instances);
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -13,12 +13,10 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 3;
void add_device_softmax_f32_f32_rank3_reduce1_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 1>>& instances)
{
add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 1>{});
add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 1>{});
}
} // namespace instance

View File

@@ -13,12 +13,10 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 3;
void add_device_softmax_f32_f32_rank3_reduce2_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 2>>& instances)
{
add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 2>{});
add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 2>{});
}
} // namespace instance

View File

@@ -13,12 +13,10 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 3;
void add_device_softmax_f32_f32_rank3_reduce3_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 3>>& instances)
{
add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 3>{});
add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 3>{});
}
} // namespace instance

View File

@@ -13,12 +13,10 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 4;
void add_device_softmax_f32_f32_rank4_reduce1_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 1>>& instances)
{
add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 1>{});
add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 1>{});
}
} // namespace instance

View File

@@ -13,12 +13,10 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 4;
void add_device_softmax_f32_f32_rank4_reduce2_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 2>>& instances)
{
add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 2>{});
add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 2>{});
}
} // namespace instance

View File

@@ -13,12 +13,10 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 4;
void add_device_softmax_f32_f32_rank4_reduce3_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 3>>& instances)
{
add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 3>{});
add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 3>{});
}
} // namespace instance

View File

@@ -13,12 +13,10 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 4;
void add_device_softmax_f32_f32_rank4_reduce4_instances(
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 4>>& instances)
{
add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 4>{});
add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 4>{});
}
} // namespace instance

View File

@@ -1,40 +0,0 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <vector>
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp"
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace instance {
void add_device_softmax_i8_i8_rank3_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances)
{
add_device_softmax_i8_i8_rank3_reduce1_instances(instances);
add_device_softmax_i8_i8_rank3_reduce2_instances(instances);
add_device_softmax_i8_i8_rank3_reduce3_instances(instances);
}
void add_device_softmax_i8_i8_rank4_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances)
{
add_device_softmax_i8_i8_rank4_reduce1_instances(instances);
add_device_softmax_i8_i8_rank4_reduce2_instances(instances);
add_device_softmax_i8_i8_rank4_reduce3_instances(instances);
add_device_softmax_i8_i8_rank4_reduce4_instances(instances);
}
} // namespace instance
} // namespace device
} // namespace tensor_operation
} // namespace ck

View File

@@ -13,12 +13,10 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 3;
void add_device_softmax_i8_i8_rank3_reduce1_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 1>>& instances)
{
add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 1>{});
add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 1>{});
}
} // namespace instance

View File

@@ -13,12 +13,10 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 3;
void add_device_softmax_i8_i8_rank3_reduce2_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 2>>& instances)
{
add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 2>{});
add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 2>{});
}
} // namespace instance

View File

@@ -13,12 +13,10 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 3;
void add_device_softmax_i8_i8_rank3_reduce3_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3, 3>>& instances)
{
add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 3>{});
add_device_operation_instances(instances, device_softmax_i8_i8_instances<3, 3>{});
}
} // namespace instance

View File

@@ -13,12 +13,10 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 4;
void add_device_softmax_i8_i8_rank4_reduce1_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 1>>& instances)
{
add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 1>{});
add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 1>{});
}
} // namespace instance

View File

@@ -13,12 +13,10 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 4;
void add_device_softmax_i8_i8_rank4_reduce2_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 2>>& instances)
{
add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 2>{});
add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 2>{});
}
} // namespace instance

View File

@@ -13,12 +13,10 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 4;
void add_device_softmax_i8_i8_rank4_reduce3_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 3>>& instances)
{
add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 3>{});
add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 3>{});
}
} // namespace instance

View File

@@ -13,12 +13,10 @@ namespace tensor_operation {
namespace device {
namespace instance {
static constexpr index_t RANK = 4;
void add_device_softmax_i8_i8_rank4_reduce4_instances(
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4, 4>>& instances)
{
add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 4>{});
add_device_operation_instances(instances, device_softmax_i8_i8_instances<4, 4>{});
}
} // namespace instance

View File

@@ -40,7 +40,11 @@ template <> std::string type_to_string<int8_t>() { return "int8"; }
template <> std::string type_to_string<int32_t>() { return "int32"; }
// clang-format on
template <typename InDataType, typename AccDataType, typename OutDataType, index_t Rank>
template <typename InDataType,
typename AccDataType,
typename OutDataType,
index_t Rank,
index_t NumReduceDim>
bool profile_softmax_impl(int do_verification,
int init_method,
bool do_log,
@@ -54,7 +58,13 @@ bool profile_softmax_impl(int do_verification,
if(Rank != in_length.size())
{
throw std::runtime_error("Input tensor rank is different from template argument Rank!");
}
};
if(NumReduceDim != reduce_dims.size())
{
throw std::runtime_error(
"Input reduce_dims rank is different from template argument NumReduceDim!");
};
Tensor<InDataType> in = in_strides.empty() ? Tensor<InDataType>(in_length)
: Tensor<InDataType>(in_length, in_strides);
@@ -92,8 +102,13 @@ bool profile_softmax_impl(int do_verification,
// add device softmax instances
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using DeviceOp = tensor_operation::device::
DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
using DeviceOp = tensor_operation::device::DeviceSoftmax<InDataType,
AccDataType,
OutDataType,
PassThrough,
PassThrough,
Rank,
NumReduceDim>;
// get device op instances
const auto instances = tensor_operation::device::instance::DeviceOperationInstanceFactory<
@@ -112,13 +127,6 @@ bool profile_softmax_impl(int do_verification,
for(auto& inst_ptr : instances)
{
// Is this user's responsibility to check if problem mismatches kernel instance (ie. rank 3
// problem to rank 4 kernel) other than invoking IsSupportedArgument()?
if(!(inst_ptr->GetNumReduceDim() == static_cast<index_t>(reduce_dims.size())))
{
continue;
}
auto argument_ptr = inst_ptr->MakeArgumentPointer(in_tensor_lengths,
in_tensor_strides,
reduce_dims,

View File

@@ -92,27 +92,76 @@ int profile_softmax(int argc, char* argv[])
{
if(data_type == SoftmaxDataType::F16_F16)
{
ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3>(do_verification,
init_method,
do_log,
time_kernel,
length,
stride,
reduce,
double(alpha),
double(beta));
if(reduce.size() == 1)
ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3, 1>(
do_verification,
init_method,
do_log,
time_kernel,
length,
stride,
reduce,
double(alpha),
double(beta));
else if(reduce.size() == 2)
ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3, 2>(
do_verification,
init_method,
do_log,
time_kernel,
length,
stride,
reduce,
double(alpha),
double(beta));
else if(reduce.size() == 3)
ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 3, 3>(
do_verification,
init_method,
do_log,
time_kernel,
length,
stride,
reduce,
double(alpha),
double(beta));
else
throw std::runtime_error("invalid number of dimensions to reduce");
}
else if(data_type == SoftmaxDataType::F32_F32)
{
ck::profiler::profile_softmax_impl<float, float, float, 3>(do_verification,
init_method,
do_log,
time_kernel,
length,
stride,
reduce,
double(alpha),
double(beta));
if(reduce.size() == 1)
ck::profiler::profile_softmax_impl<float, float, float, 3, 1>(do_verification,
init_method,
do_log,
time_kernel,
length,
stride,
reduce,
double(alpha),
double(beta));
else if(reduce.size() == 2)
ck::profiler::profile_softmax_impl<float, float, float, 3, 2>(do_verification,
init_method,
do_log,
time_kernel,
length,
stride,
reduce,
double(alpha),
double(beta));
else if(reduce.size() == 3)
ck::profiler::profile_softmax_impl<float, float, float, 3, 3>(do_verification,
init_method,
do_log,
time_kernel,
length,
stride,
reduce,
double(alpha),
double(beta));
else
throw std::runtime_error("invalid number of dimensions to reduce");
}
else
{
@@ -124,27 +173,97 @@ int profile_softmax(int argc, char* argv[])
{
if(data_type == SoftmaxDataType::F16_F16)
{
ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4>(do_verification,
init_method,
do_log,
time_kernel,
length,
stride,
reduce,
double(alpha),
double(beta));
if(reduce.size() == 1)
ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4, 1>(
do_verification,
init_method,
do_log,
time_kernel,
length,
stride,
reduce,
double(alpha),
double(beta));
else if(reduce.size() == 2)
ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4, 2>(
do_verification,
init_method,
do_log,
time_kernel,
length,
stride,
reduce,
double(alpha),
double(beta));
else if(reduce.size() == 3)
ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4, 3>(
do_verification,
init_method,
do_log,
time_kernel,
length,
stride,
reduce,
double(alpha),
double(beta));
else if(reduce.size() == 4)
ck::profiler::profile_softmax_impl<ck::half_t, float, ck::half_t, 4, 4>(
do_verification,
init_method,
do_log,
time_kernel,
length,
stride,
reduce,
double(alpha),
double(beta));
else
throw std::runtime_error("invalid number of dimensions to reduce");
}
else if(data_type == SoftmaxDataType::F32_F32)
{
ck::profiler::profile_softmax_impl<float, float, float, 4>(do_verification,
init_method,
do_log,
time_kernel,
length,
stride,
reduce,
double(alpha),
double(beta));
if(reduce.size() == 1)
ck::profiler::profile_softmax_impl<float, float, float, 4, 1>(do_verification,
init_method,
do_log,
time_kernel,
length,
stride,
reduce,
double(alpha),
double(beta));
else if(reduce.size() == 2)
ck::profiler::profile_softmax_impl<float, float, float, 4, 2>(do_verification,
init_method,
do_log,
time_kernel,
length,
stride,
reduce,
double(alpha),
double(beta));
else if(reduce.size() == 3)
ck::profiler::profile_softmax_impl<float, float, float, 4, 3>(do_verification,
init_method,
do_log,
time_kernel,
length,
stride,
reduce,
double(alpha),
double(beta));
else if(reduce.size() == 4)
ck::profiler::profile_softmax_impl<float, float, float, 4, 4>(do_verification,
init_method,
do_log,
time_kernel,
length,
stride,
reduce,
double(alpha),
double(beta));
else
throw std::runtime_error("invalid number of dimensions to reduce");
}
else
{

View File

@@ -61,8 +61,92 @@ class TestSoftmax : public ::testing::Test
int init_method = 1; // integer value initialization
bool log = false;
std::vector<ck::index_t> strides; // intenionally empty, to get packed layout.
bool pass = ck::profiler::profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank>(
verify_, init_method, log, bench_, in_length, strides, reduce_dims, alpha, beta);
bool pass = false;
if constexpr(Rank == 3)
{
if(reduce_dims.size() == 1)
pass = ck::profiler::
profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 1>(verify_,
init_method,
log,
bench_,
in_length,
strides,
reduce_dims,
alpha,
beta);
else if(reduce_dims.size() == 2)
pass = ck::profiler::
profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 2>(verify_,
init_method,
log,
bench_,
in_length,
strides,
reduce_dims,
alpha,
beta);
else if(reduce_dims.size() == 3)
pass = ck::profiler::
profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 3>(verify_,
init_method,
log,
bench_,
in_length,
strides,
reduce_dims,
alpha,
beta);
}
else if constexpr(Rank == 4)
{
if(reduce_dims.size() == 1)
pass = ck::profiler::
profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 1>(verify_,
init_method,
log,
bench_,
in_length,
strides,
reduce_dims,
alpha,
beta);
else if(reduce_dims.size() == 2)
pass = ck::profiler::
profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 2>(verify_,
init_method,
log,
bench_,
in_length,
strides,
reduce_dims,
alpha,
beta);
else if(reduce_dims.size() == 3)
pass = ck::profiler::
profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 3>(verify_,
init_method,
log,
bench_,
in_length,
strides,
reduce_dims,
alpha,
beta);
else if(reduce_dims.size() == 4)
pass = ck::profiler::
profile_softmax_impl<InDataType, AccDataType, OutDataType, Rank, 4>(verify_,
init_method,
log,
bench_,
in_length,
strides,
reduce_dims,
alpha,
beta);
};
EXPECT_TRUE(pass);
}