diff --git a/include/ck/tensor_operation/gpu/device/device_softmax.hpp b/include/ck/tensor_operation/gpu/device/device_softmax.hpp index dc40f7c789..676e0812b7 100644 --- a/include/ck/tensor_operation/gpu/device/device_softmax.hpp +++ b/include/ck/tensor_operation/gpu/device/device_softmax.hpp @@ -6,6 +6,7 @@ #include #include +#include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/device_base.hpp" namespace ck { diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp index 0ccac7c746..93855eb33e 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp @@ -226,6 +226,30 @@ struct DeviceReduceMultiBlock in_elementwise_op_{in_elementwise_op}, acc_elementwise_op_{acc_elementwise_op} { + if(Rank != inLengths.size() || Rank != inStrides.size() || + NumReduceDim != reduceDims.size()) + { + throw std::runtime_error( + "One of inLengths/inStrides/reduceDims has invalid size!" + "\nExpected size inLengths: " + + std::to_string(Rank) + ", inStrides: " + std::to_string(Rank) + + ", reduceDims: " + std::to_string(NumReduceDim) + + "\nBut have inLengths: " + std::to_string(inLengths.size()) + + ", inStrides: " + std::to_string(inStrides.size()) + + ", reduceDims: " + std::to_string(reduceDims.size())); + } + + for(std::size_t i = 0; i < reduceDims.size(); ++i) + { + if(reduceDims[i] < 0 || reduceDims[i] >= Rank) + { + throw std::runtime_error("Provided reduce dimension exceed input tensor Rank!" + "\nHave reduceDims[" + + std::to_string(i) + + "]: " + std::to_string(reduceDims[i])); + } + } + inLengths_ = shuffle_tensor_dimensions(inLengths, reduceDims); inStrides_ = shuffle_tensor_dimensions(inStrides, reduceDims); diff --git a/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp index fba820578b..8630a2c6e2 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp @@ -40,8 +40,9 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax { - static constexpr index_t kRank = Rank; - static constexpr index_t kNumReduceDim = NumReduceDim; + static constexpr index_t kRank = Rank; + static constexpr index_t kNumReduceDim = NumReduceDim; + static constexpr index_t kNumInvariantDim = Rank - NumReduceDim; virtual index_t GetRank() const override { return kRank; } @@ -168,6 +169,30 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax= Rank) + { + throw std::runtime_error("Provided reduce dimension exceed input tensor Rank!" + "\nHave reduceDims[" + + std::to_string(i) + + "]: " + std::to_string(reduceDims[i])); + } + } + inLengths_ = shuffle_tensor_dimensions(inLengths, reduceDims); inStrides_ = shuffle_tensor_dimensions(inStrides, reduceDims); @@ -257,40 +282,78 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax(p_arg); - if constexpr(InSrcVectorDim == 0) { - if constexpr(NumInvariantDim == 0) + if constexpr(kNumInvariantDim == 0) { return false; } else { - if(p_arg_->inStrides_[NumInvariantDim - 1] != 1) + if(arg.inStrides_[kNumInvariantDim - 1] != 1 && InSrcVectorSize != 1) + { return false; - - if(p_arg_->invariant_lowest_length_ % InSrcVectorSize != 0) + } + if(arg.invariant_lowest_length_ % InSrcVectorSize != 0) + { return false; - }; + } + } } else { - if(p_arg_->inStrides_[Rank - 1] != 1) + if(arg.inStrides_[Rank - 1] != 1 && InSrcVectorSize != 1) + { return false; - - if(p_arg_->inLengths_[Rank - 1] % InSrcVectorSize != 0) + } + if(arg.inLengths_[Rank - 1] % InSrcVectorSize != 0) + { return false; - }; + } + } - if(p_arg_->invariant_lowest_length_ % OutDstVectorSize != 0) + // To improve + if(kNumInvariantDim > 0 && arg.invariant_lowest_length_ % OutDstVectorSize != 0) + { return false; + } + + if(arg.inLengths_[Rank - 1] % OutDstVectorSize != 0) + { + return false; + } return true; }; + bool IsSupportedArgument(const BaseArgument* p_arg) override + { + return IsSupportedArgument(*dynamic_cast(p_arg)); + } + + static auto MakeArgument(const std::vector inLengths, + const std::vector inStrides, + const std::vector reduceDims, + const AccDataType alpha, + const AccDataType beta, + const InDataType* in_dev, + OutDataType* out_dev, + InElementwiseOp in_elementwise_op, + AccElementwiseOp acc_elementwise_op) + { + return Argument{inLengths, + inStrides, + reduceDims, + alpha, + beta, + in_dev, + out_dev, + in_elementwise_op, + acc_elementwise_op}; + }; + // // @brief Makes a pointer to Argument class. // @@ -330,6 +393,8 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax MakeInvokerPointer() override { return std::make_unique(); @@ -340,10 +405,13 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax"; + str << "DeviceReduceSoftmax<" + << Rank << "," << NumReduceDim << "," << BlockSize << "," + << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << "," + << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << "," + << "InSrcVectorDim_" << InSrcVectorDim + << "_InSrcVectorSize_" << InSrcVectorSize + << "_OutDstVectorSize_" << OutDstVectorSize << ">"; // clang-format on return str.str(); diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp index bfc6986d0c..01b851a219 100644 --- a/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp @@ -60,6 +60,12 @@ struct ReferenceSoftmax : public device::BaseOperator { scalar_lengths.push_back(arg.in_.mDesc.GetLengths()[dim]); } + // max and sum reduction with final reduced values of dim=0 is a scalar so give it + // appropriate lengths of {1} + if(arg.sm_scalar_dims_.size() == 0) + { + scalar_lengths.push_back(1); + } Tensor reduce_max(scalar_lengths); reduce_max.GenerateTensorValue( @@ -67,6 +73,9 @@ struct ReferenceSoftmax : public device::BaseOperator Tensor reduce_sum(scalar_lengths); reduce_sum.GenerateTensorValue(GeneratorTensor_1{0}); + // when final reduced values is of dim=0, the index will be transformed into empty + // std::vector which is actually a valid input for Tensor::operator(std::vector) and + // internally accesses 0'th element auto to_sm_scalar_idx = [&](auto idx) { std::vector sm_scalar_idx; for(index_t dim : arg.sm_scalar_dims_) diff --git a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp index 49ba995a46..9f71af75cc 100644 --- a/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp +++ b/library/include/ck/library/tensor_operation_instance/device_operation_instance_factory.hpp @@ -3,10 +3,10 @@ #pragma once -#include "ck/utility/data_type.hpp" -#include "ck/utility/tuple.hpp" #include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/utility/data_type.hpp" +#include "ck/utility/tuple.hpp" namespace ck { namespace tensor_operation { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp index 0ef87252e6..36eb092f0f 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax.hpp @@ -8,20 +8,13 @@ #include "ck/ck.hpp" #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" #include "ck/tensor_operation/gpu/device/device_softmax.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" -#include "ck/utility/data_type.hpp" namespace ck { namespace tensor_operation { namespace device { namespace instance { -using F16 = ck::half_t; -using F32 = float; -using PassThrough = ck::tensor_operation::element_wise::PassThrough; - void add_device_softmax_f16_f16_rank3_instances( std::vector>&); void add_device_softmax_f16_f16_rank4_instances( @@ -32,6 +25,11 @@ void add_device_softmax_f32_f32_rank3_instances( void add_device_softmax_f32_f32_rank4_instances( std::vector>&); +void add_device_softmax_i8_i8_rank3_instances( + std::vector>&); +void add_device_softmax_i8_i8_rank4_instances( + std::vector>&); + template struct DeviceOperationInstanceFactory< ck::tensor_operation::device:: @@ -60,6 +58,14 @@ struct DeviceOperationInstanceFactory< else if constexpr(Rank == 4) add_device_softmax_f32_f32_rank4_instances(op_ptrs); } + else if constexpr(std::is_same_v && std::is_same_v && + std::is_same_v) + { + if constexpr(Rank == 3) + add_device_softmax_i8_i8_rank3_instances(op_ptrs); + else if constexpr(Rank == 4) + add_device_softmax_i8_i8_rank4_instances(op_ptrs); + } return op_ptrs; } diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp new file mode 100644 index 0000000000..83f52fc3ee --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f16_f16_rank3_instances( + std::vector>& instances); +void add_device_softmax_f16_f16_rank4_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp new file mode 100644 index 0000000000..046ff57805 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f16_f16_rank3_reduce1_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp new file mode 100644 index 0000000000..8e6a226f6a --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f16_f16_rank3_reduce2_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp new file mode 100644 index 0000000000..518fa5f986 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f16_f16_rank3_reduce3_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp new file mode 100644 index 0000000000..10016cdd70 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f16_f16_rank4_reduce1_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp new file mode 100644 index 0000000000..cdd5a3cd7b --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f16_f16_rank4_reduce2_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp new file mode 100644 index 0000000000..a8be272e02 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f16_f16_rank4_reduce3_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp new file mode 100644 index 0000000000..ec8296ff22 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f16_f16_rank4_reduce4_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp new file mode 100644 index 0000000000..b3877c4bb3 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp @@ -0,0 +1,39 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +template +using device_softmax_f16_f16_instances = std::tuple< + // clang-format off + // InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize> + // fallback kernel + DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, + DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 8, 1, 8, 8>, + DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 4, 64, 1, 8, 1, 8, 8>, + DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 8, 1, 8, 8>, + DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 16, 1, 8, 8>, + DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 32, 1, 8, 8>, + DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 8>, + DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 8>, + DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 8>, + // Reduction on middle dimensions + // InSrcVectorDim is 0 since we want to coalesce reads on M dimension + DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 8, 4, 0, 1, 1>, + DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 8, 4, 0, 8, 4> + // clang-format on + >; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp new file mode 100644 index 0000000000..a6d9a359f4 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f32_f32_rank3_instances( + std::vector>& instances); +void add_device_softmax_f32_f32_rank4_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp new file mode 100644 index 0000000000..6621a2c867 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f32_f32_rank3_reduce1_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp new file mode 100644 index 0000000000..3dfac98ed8 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f32_f32_rank3_reduce2_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp new file mode 100644 index 0000000000..6d2a0c9325 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f32_f32_rank3_reduce3_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp new file mode 100644 index 0000000000..97dd3dcb18 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f32_f32_rank4_reduce1_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp new file mode 100644 index 0000000000..58f8760acc --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f32_f32_rank4_reduce2_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp new file mode 100644 index 0000000000..df8d31f0da --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f32_f32_rank4_reduce3_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp new file mode 100644 index 0000000000..1bd773227e --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_f32_f32_rank4_reduce4_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp new file mode 100644 index 0000000000..16f129d2d0 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp" +#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +template +using device_softmax_f32_f32_instances = std::tuple< + // clang-format off + // InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize> + DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, // fallback kernel + DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 4>, + DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 4, 64, 1, 8, 1, 4, 4>, + DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 8, 1, 4, 4>, + DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 16, 1, 4, 4>, + DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 32, 1, 4, 4>, + DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 8, 1, 4, 4>, + DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 16, 1, 4, 4>, + DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 32, 1, 4, 4>, + // Reduction on middle dimensions + // InSrcVectorDim is 0 since we want to coalesce reads on M dimension + DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 8, 4, 0, 1, 1>, + DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 8, 4, 0, 4, 4> + // clang-format on + >; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp new file mode 100644 index 0000000000..f80f712ff5 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_i8_i8_rank3_instances( + std::vector>& instances); +void add_device_softmax_i8_i8_rank4_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp new file mode 100644 index 0000000000..6f9952e7d5 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_i8_i8_rank3_reduce1_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp new file mode 100644 index 0000000000..2cbd13a1ba --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_i8_i8_rank3_reduce2_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp new file mode 100644 index 0000000000..7b12522a85 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_i8_i8_rank3_reduce3_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp new file mode 100644 index 0000000000..54d477f80c --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_i8_i8_rank4_reduce1_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp new file mode 100644 index 0000000000..4ffc44e3a9 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_i8_i8_rank4_reduce2_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp new file mode 100644 index 0000000000..08cbb81272 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_i8_i8_rank4_reduce3_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp new file mode 100644 index 0000000000..187d034b95 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_softmax.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_i8_i8_rank4_reduce4_instances( + std::vector>& instances); + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp new file mode 100644 index 0000000000..7fc9ed6919 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/ck.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp" +#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +template +using device_softmax_i8_i8_instances = std::tuple< + // clang-format off + // InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize> + // fallback kernel + DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 16, 1, 1, 1>, + DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 16, 1, 16, 16>, + DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 4, 64, 1, 16, 1, 16, 16>, + DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 16, 1, 16, 16>, + DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 32, 1, 16, 16>, + DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 64, 1, 16, 16>, + DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 16, 1, 16, 16>, + DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 32, 1, 16, 16>, + DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 64, 1, 16, 16>, + // Reduction on middle dimensions + // InSrcVectorDim is 0 since we want to coalesce reads on M dimension + DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 8, 8, 0, 1, 1>, + DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 32, 8, 32, 8, 0, 16, 8> + // clang-format on + >; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp new file mode 100644 index 0000000000..03be6e2bc7 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp" diff --git a/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt index 081cb23b23..fc13261a6a 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/softmax/CMakeLists.txt @@ -1,4 +1,26 @@ add_instance_library(device_softmax_instance + device_softmax_i8_i8_instance.cpp + device_softmax_i8_i8_instance_rank3_reduce1.cpp + device_softmax_i8_i8_instance_rank3_reduce2.cpp + device_softmax_i8_i8_instance_rank3_reduce3.cpp + device_softmax_i8_i8_instance_rank4_reduce1.cpp + device_softmax_i8_i8_instance_rank4_reduce2.cpp + device_softmax_i8_i8_instance_rank4_reduce3.cpp + device_softmax_i8_i8_instance_rank4_reduce4.cpp device_softmax_f16_f16_instance.cpp + device_softmax_f16_f16_instance_rank3_reduce1.cpp + device_softmax_f16_f16_instance_rank3_reduce2.cpp + device_softmax_f16_f16_instance_rank3_reduce3.cpp + device_softmax_f16_f16_instance_rank4_reduce1.cpp + device_softmax_f16_f16_instance_rank4_reduce2.cpp + device_softmax_f16_f16_instance_rank4_reduce3.cpp + device_softmax_f16_f16_instance_rank4_reduce4.cpp device_softmax_f32_f32_instance.cpp + device_softmax_f32_f32_instance_rank3_reduce1.cpp + device_softmax_f32_f32_instance_rank3_reduce2.cpp + device_softmax_f32_f32_instance_rank3_reduce3.cpp + device_softmax_f32_f32_instance_rank4_reduce1.cpp + device_softmax_f32_f32_instance_rank4_reduce2.cpp + device_softmax_f32_f32_instance_rank4_reduce3.cpp + device_softmax_f32_f32_instance_rank4_reduce4.cpp ) diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp index 819532e883..14d2764529 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.cpp @@ -1,55 +1,37 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. -#include #include -#include "ck/ck.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp" -#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" -#include "ck/utility/data_type.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp" + +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp" namespace ck { namespace tensor_operation { namespace device { namespace instance { -namespace { -using F16 = ck::half_t; -using F32 = float; -using Pass = ck::tensor_operation::element_wise::PassThrough; -} // namespace - -template -using device_softmax_f16_f16_instances = std::tuple< - // clang-format off - // InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize> - DeviceSoftmaxImpl< F16, F32, F16, Pass, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, // fallback kernel - DeviceSoftmaxImpl< F16, F32, F16, Pass, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 8, 8>, - DeviceSoftmaxImpl< F16, F32, F16, Pass, Pass, Rank, Reduce, 256, 4, 64, 1, 8, 1, 8, 8>, - DeviceSoftmaxImpl< F16, F32, F16, Pass, Pass, Rank, Reduce, 256, 2, 128, 1, 8, 1, 8, 8>, - DeviceSoftmaxImpl< F16, F32, F16, Pass, Pass, Rank, Reduce, 256, 2, 128, 1, 16, 1, 8, 8>, - DeviceSoftmaxImpl< F16, F32, F16, Pass, Pass, Rank, Reduce, 256, 2, 128, 1, 32, 1, 8, 8>, - DeviceSoftmaxImpl< F16, F32, F16, Pass, Pass, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 8>, - DeviceSoftmaxImpl< F16, F32, F16, Pass, Pass, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 8>, - DeviceSoftmaxImpl< F16, F32, F16, Pass, Pass, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 8> - // clang-format on - >; - void add_device_softmax_f16_f16_rank3_instances( - std::vector>& instances) + std::vector>& instances) { - add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 1>{}); - add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 2>{}); + add_device_softmax_f16_f16_rank3_reduce1_instances(instances); + add_device_softmax_f16_f16_rank3_reduce2_instances(instances); + add_device_softmax_f16_f16_rank3_reduce3_instances(instances); } void add_device_softmax_f16_f16_rank4_instances( - std::vector>& instances) + std::vector>& instances) { - add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 1>{}); - add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 2>{}); - add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 3>{}); + add_device_softmax_f16_f16_rank4_reduce1_instances(instances); + add_device_softmax_f16_f16_rank4_reduce2_instances(instances); + add_device_softmax_f16_f16_rank4_reduce3_instances(instances); + add_device_softmax_f16_f16_rank4_reduce4_instances(instances); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp new file mode 100644 index 0000000000..fa334b997c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 3; + +void add_device_softmax_f16_f16_rank3_reduce1_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_f16_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp new file mode 100644 index 0000000000..1c9d37d848 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 3; + +void add_device_softmax_f16_f16_rank3_reduce2_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_f16_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp new file mode 100644 index 0000000000..5fbdab5055 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 3; + +void add_device_softmax_f16_f16_rank3_reduce3_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_f16_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp new file mode 100644 index 0000000000..7dd8640b18 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 4; + +void add_device_softmax_f16_f16_rank4_reduce1_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_f16_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp new file mode 100644 index 0000000000..b32fe6838f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 4; + +void add_device_softmax_f16_f16_rank4_reduce2_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_f16_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp new file mode 100644 index 0000000000..c05048ec56 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 4; + +void add_device_softmax_f16_f16_rank4_reduce3_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_f16_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp new file mode 100644 index 0000000000..6a235708bd --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 4; + +void add_device_softmax_f16_f16_rank4_reduce4_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_f16_f16_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp index cfc85986c4..e5bec5e263 100644 --- a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.cpp @@ -1,53 +1,37 @@ // SPDX-License-Identifier: MIT // Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. -#include #include -#include "ck/ck.hpp" -#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp" -#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp" + +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp" namespace ck { namespace tensor_operation { namespace device { namespace instance { -namespace { -using F32 = float; -using Pass = ck::tensor_operation::element_wise::PassThrough; -} // namespace - -template -using device_softmax_f32_f32_instances = std::tuple< - // clang-format off - // InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize> - DeviceSoftmaxImpl< F32, F32, F32, Pass, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, // fallback kernel - DeviceSoftmaxImpl< F32, F32, F32, Pass, Pass, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 4>, - DeviceSoftmaxImpl< F32, F32, F32, Pass, Pass, Rank, Reduce, 256, 4, 64, 1, 8, 1, 4, 4>, - DeviceSoftmaxImpl< F32, F32, F32, Pass, Pass, Rank, Reduce, 256, 2, 128, 1, 8, 1, 4, 4>, - DeviceSoftmaxImpl< F32, F32, F32, Pass, Pass, Rank, Reduce, 256, 2, 128, 1, 16, 1, 4, 4>, - DeviceSoftmaxImpl< F32, F32, F32, Pass, Pass, Rank, Reduce, 256, 2, 128, 1, 32, 1, 4, 4>, - DeviceSoftmaxImpl< F32, F32, F32, Pass, Pass, Rank, Reduce, 256, 1, 256, 1, 8, 1, 4, 4>, - DeviceSoftmaxImpl< F32, F32, F32, Pass, Pass, Rank, Reduce, 256, 1, 256, 1, 16, 1, 4, 4>, - DeviceSoftmaxImpl< F32, F32, F32, Pass, Pass, Rank, Reduce, 256, 1, 256, 1, 32, 1, 4, 4> - // clang-format on - >; - void add_device_softmax_f32_f32_rank3_instances( - std::vector>& instances) + std::vector>& instances) { - add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 1>{}); - add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 2>{}); + add_device_softmax_f32_f32_rank3_reduce1_instances(instances); + add_device_softmax_f32_f32_rank3_reduce2_instances(instances); + add_device_softmax_f32_f32_rank3_reduce3_instances(instances); } void add_device_softmax_f32_f32_rank4_instances( - std::vector>& instances) + std::vector>& instances) { - add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 1>{}); - add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 2>{}); - add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 3>{}); + add_device_softmax_f32_f32_rank4_reduce1_instances(instances); + add_device_softmax_f32_f32_rank4_reduce2_instances(instances); + add_device_softmax_f32_f32_rank4_reduce3_instances(instances); + add_device_softmax_f32_f32_rank4_reduce4_instances(instances); } } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp new file mode 100644 index 0000000000..57d3f184a6 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 3; + +void add_device_softmax_f32_f32_rank3_reduce1_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_f32_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp new file mode 100644 index 0000000000..fae3a4dd66 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 3; + +void add_device_softmax_f32_f32_rank3_reduce2_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_f32_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp new file mode 100644 index 0000000000..b6fb70e8e2 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 3; + +void add_device_softmax_f32_f32_rank3_reduce3_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_f32_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp new file mode 100644 index 0000000000..33c7b6f35f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 4; + +void add_device_softmax_f32_f32_rank4_reduce1_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_f32_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp new file mode 100644 index 0000000000..c22aa574b1 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 4; + +void add_device_softmax_f32_f32_rank4_reduce2_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_f32_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp new file mode 100644 index 0000000000..55f3d2bd20 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 4; + +void add_device_softmax_f32_f32_rank4_reduce3_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_f32_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp new file mode 100644 index 0000000000..fb0bcf5ee8 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 4; + +void add_device_softmax_f32_f32_rank4_reduce4_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_f32_f32_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp new file mode 100644 index 0000000000..608cfcf838 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.cpp @@ -0,0 +1,40 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp" + +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_softmax_i8_i8_rank3_instances( + std::vector>& instances) +{ + add_device_softmax_i8_i8_rank3_reduce1_instances(instances); + add_device_softmax_i8_i8_rank3_reduce2_instances(instances); + add_device_softmax_i8_i8_rank3_reduce3_instances(instances); +} + +void add_device_softmax_i8_i8_rank4_instances( + std::vector>& instances) +{ + add_device_softmax_i8_i8_rank4_reduce1_instances(instances); + add_device_softmax_i8_i8_rank4_reduce2_instances(instances); + add_device_softmax_i8_i8_rank4_reduce3_instances(instances); + add_device_softmax_i8_i8_rank4_reduce4_instances(instances); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp new file mode 100644 index 0000000000..15552dbae5 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 3; + +void add_device_softmax_i8_i8_rank3_reduce1_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_i8_i8_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp new file mode 100644 index 0000000000..6767402886 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 3; + +void add_device_softmax_i8_i8_rank3_reduce2_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_i8_i8_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp new file mode 100644 index 0000000000..4b33da93c2 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 3; + +void add_device_softmax_i8_i8_rank3_reduce3_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_i8_i8_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp new file mode 100644 index 0000000000..fe3b823e88 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 4; + +void add_device_softmax_i8_i8_rank4_reduce1_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_i8_i8_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp new file mode 100644 index 0000000000..8ecdf87d9f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 4; + +void add_device_softmax_i8_i8_rank4_reduce2_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_i8_i8_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp new file mode 100644 index 0000000000..3563135204 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 4; + +void add_device_softmax_i8_i8_rank4_reduce3_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_i8_i8_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp new file mode 100644 index 0000000000..aa21a0bf8a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include + +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr index_t RANK = 4; + +void add_device_softmax_i8_i8_rank4_reduce4_instances( + std::vector>& instances) +{ + add_device_operation_instances(instances, device_softmax_i8_i8_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/profiler/include/profile_softmax_impl.hpp b/profiler/include/profile_softmax_impl.hpp index 8394a58453..090cdaaa9a 100644 --- a/profiler/include/profile_softmax_impl.hpp +++ b/profiler/include/profile_softmax_impl.hpp @@ -3,55 +3,27 @@ #pragma once +#include #include +#include +#include +#include #include "ck/ck.hpp" #include "ck/library/utility/check_err.hpp" -#include "ck/library/utility/convolution_parameter.hpp" #include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/fill.hpp" #include "ck/library/utility/host_tensor.hpp" -#include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" +#include "ck/library/tensor_operation_instance/gpu/softmax.hpp" #include "ck/tensor_operation/gpu/device/device_softmax.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" #include "ck/utility/data_type.hpp" -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -namespace { -using F16 = ck::half_t; -using F32 = float; -using PassThrough = ck::tensor_operation::element_wise::PassThrough; -} // namespace - -void add_device_softmax_f16_f16_rank3_instances( - std::vector>&); -void add_device_softmax_f16_f16_rank4_instances( - std::vector>&); - -void add_device_softmax_f32_f32_rank3_instances( - std::vector>&); -void add_device_softmax_f32_f32_rank4_instances( - std::vector>&); - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck - namespace ck { namespace profiler { -enum struct NormType -{ - BATCHNORM, - SOFTMAX, -}; - -enum struct NormDataType +enum struct SoftmaxDataType { F32_F32, // in, out F16_F16, @@ -60,7 +32,7 @@ enum struct NormDataType }; // clang-format off -template std::string type_to_string(); +template std::string type_to_string(); template <> std::string type_to_string() { return "f32"; } template <> std::string type_to_string() { return "f16"; } template <> std::string type_to_string() { return "bf16"; } @@ -69,7 +41,7 @@ template <> std::string type_to_string() { return "int32"; } // clang-format on template -void profile_softmax_impl(int do_verification, +bool profile_softmax_impl(int do_verification, int init_method, bool do_log, bool time_kernel, @@ -77,8 +49,7 @@ void profile_softmax_impl(int do_verification, std::vector in_strides, std::vector reduce_dims, AccDataType alpha, - AccDataType beta, - NormType norm_type) + AccDataType beta) { if(Rank != in_length.size()) { @@ -88,62 +59,46 @@ void profile_softmax_impl(int do_verification, Tensor in = in_strides.empty() ? Tensor(in_length) : Tensor(in_length, in_strides); Tensor out(in.mDesc); + Tensor prior_out(in.mDesc); switch(init_method) { - // case 0: break; - case 0: - in.GenerateTensorValue(GeneratorTensor_1{}); - out.GenerateTensorValue(GeneratorTensor_1{}); - break; + case 0: break; case 1: - in.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - out.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(in.begin(), in.end()); + ck::utils::FillUniformDistributionIntegerValue{-5.f, 5.f}(prior_out.begin(), + prior_out.end()); break; default: - in.GenerateTensorValue(GeneratorTensor_3{0.0, 1.0}); - out.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + ck::utils::FillUniformDistribution{0.0f, 1.0f}(in); + ck::utils::FillUniformDistribution{-0.5f, 0.5f}(prior_out); } - Tensor out_ref(out); + Tensor out_ref(prior_out); - DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); - DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize()); - in_dev.ToDevice(in.mData.data()); - out_dev.ToDevice(out.mData.data()); + if(do_verification) + { + using ReferenceSoftmax = + tensor_operation::host::ReferenceSoftmax; + ReferenceSoftmax{}.MakeInvoker().Run({in, out_ref, alpha, beta, reduce_dims}); + } - std::vector i_in_lengths(in.mDesc.GetLengths().begin(), in.mDesc.GetLengths().end()); - std::vector i_in_strides(in.mDesc.GetStrides().begin(), in.mDesc.GetStrides().end()); + DeviceMem in_dev(in.GetElementSpaceSizeInBytes()); + DeviceMem out_dev(out.GetElementSpaceSizeInBytes()); + in_dev.ToDevice(in.data()); + + std::vector in_tensor_lengths(in.GetLengths().begin(), in.GetLengths().end()); + std::vector in_tensor_strides(in.GetStrides().begin(), in.GetStrides().end()); // add device softmax instances using PassThrough = ck::tensor_operation::element_wise::PassThrough; - using DeviceOpPtr = tensor_operation::device:: - DeviceSoftmaxPtr; - std::vector instances; + using DeviceOp = tensor_operation::device:: + DeviceSoftmax; - if(norm_type == NormType::SOFTMAX) - { - if constexpr(is_same::value && is_same::value && - is_same::value) - { - if constexpr(Rank == 3) - tensor_operation::device::instance::add_device_softmax_f16_f16_rank3_instances( - instances); - else if constexpr(Rank == 4) - tensor_operation::device::instance::add_device_softmax_f16_f16_rank4_instances( - instances); - } - else if constexpr(is_same::value && is_same::value && - is_same::value) - { - if constexpr(Rank == 3) - tensor_operation::device::instance::add_device_softmax_f32_f32_rank3_instances( - instances); - else if constexpr(Rank == 4) - tensor_operation::device::instance::add_device_softmax_f32_f32_rank4_instances( - instances); - } - } + // get device op instances + const auto instances = tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + std::cout << "found " << instances.size() << " instances" << std::endl; if(instances.size() <= 0) { @@ -153,21 +108,19 @@ void profile_softmax_impl(int do_verification, std::string best_instance_name; float best_avg_time = std::numeric_limits::max(); float best_gb_per_sec = 0; - - using PassThrough = ck::tensor_operation::element_wise::PassThrough; + std::vector instance_pass; for(auto& inst_ptr : instances) { // Is this user's responsibility to check if problem mismatches kernel instance (ie. rank 3 // problem to rank 4 kernel) other than invoking IsSupportedArgument()? - if(!(inst_ptr->GetRank() == static_cast(i_in_lengths.size()) && - inst_ptr->GetNumReduceDim() == static_cast(reduce_dims.size()))) + if(!(inst_ptr->GetNumReduceDim() == static_cast(reduce_dims.size()))) { continue; } - auto argument_ptr = inst_ptr->MakeArgumentPointer(i_in_lengths, - i_in_strides, + auto argument_ptr = inst_ptr->MakeArgumentPointer(in_tensor_lengths, + in_tensor_strides, reduce_dims, &alpha, &beta, @@ -181,45 +134,42 @@ void profile_softmax_impl(int do_verification, std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: "; LogRange(std::cout << "input lengths = [", in_length, ", ") << "], " - << "scaler = [" << alpha << ", " << beta << "]." << std::endl; - return; + << "scaler = [" << alpha << ", " << beta << "]"; + LogRange(std::cout << ", reduce dims = [", reduce_dims, ", ") << "]." << std::endl; + instance_pass.push_back(true); + continue; } + out_dev.ToDevice(prior_out.data()); auto invoker_ptr = inst_ptr->MakeInvokerPointer(); + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); - float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); - - std::size_t num_bytes = - in.mDesc.GetElementSize() * sizeof(InDataType) + - (beta == 0.0f ? 1 : 2) * out.mDesc.GetElementSize() * sizeof(OutDataType); - - float gb_per_sec = num_bytes / 1.E6 / avg_time; - - std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, " - << inst_ptr->GetTypeString() << std::endl; - - if(avg_time < best_avg_time) + if(time_kernel) { - best_instance_name = inst_ptr->GetTypeString(); - best_avg_time = avg_time; - best_gb_per_sec = gb_per_sec; + std::size_t num_bytes = + in.GetElementSize() * sizeof(InDataType) + + (beta == 0.0f ? 1 : 2) * out.GetElementSize() * sizeof(OutDataType); + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, " + << inst_ptr->GetTypeString() << std::endl; + + if(avg_time < best_avg_time) + { + best_instance_name = inst_ptr->GetTypeString(); + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + } } if(do_verification) { - // TODO: factory method to dynamically switch between different reference normalizations - using ReferenceFactory = - tensor_operation::host::ReferenceSoftmax; - - ReferenceFactory{}.MakeInvoker().Run({in, out_ref, alpha, beta, reduce_dims}); - - out_dev.FromDevice(out.mData.data()); - - bool pass; + out_dev.FromDevice(out.data()); + bool pass = true; if(std::is_same::value) { - pass = ck::utils::check_err( - out.mData, out_ref.mData, "Error: Incorrect results!", 0, 1); + pass = pass && ck::utils::check_err( + out.mData, out_ref.mData, "Error: Incorrect results!", 0, 1); if(do_log) { LogRangeAsType(std::cout << "in : ", in.mData, ",") << std::endl; @@ -230,7 +180,7 @@ void profile_softmax_impl(int do_verification, } else { - pass = ck::utils::check_err(out.mData, out_ref.mData); + pass = pass && ck::utils::check_err(out.mData, out_ref.mData); if(do_log) { LogRangeAsType(std::cout << "in : ", in.mData, ",") << std::endl; @@ -247,16 +197,22 @@ void profile_softmax_impl(int do_verification, << "], " << "scaler = [" << alpha << ", " << beta << "]." << std::endl; } + instance_pass.push_back(pass); } } - std::cout << "Best Perf for datatype = " << type_to_string() << "_" - << type_to_string() << ", "; - LogRange(std::cout << "length = ", i_in_lengths, ",") << ", "; - LogRange(std::cout << "stride = ", i_in_strides, ",") << ", "; - LogRange(std::cout << "reduce dims ", reduce_dims, ",") << ", "; - std::cout << "alpha = " << alpha << ", " - << "beta = " << beta << ", " << best_avg_time << " ms, " << best_gb_per_sec - << " GB/s, " << best_instance_name << std::endl; + if(time_kernel) + { + std::cout << "Best Perf for datatype = " << type_to_string() << "_" + << type_to_string() << ", "; + LogRange(std::cout << "length = ", in_tensor_lengths, ",") << ", "; + LogRange(std::cout << "stride = ", in_tensor_strides, ",") << ", "; + LogRange(std::cout << "reduce dims ", reduce_dims, ",") << ", "; + std::cout << "alpha = " << alpha << ", " + << "beta = " << beta << ", " << best_avg_time << " ms, " << best_gb_per_sec + << " GB/s, " << best_instance_name << std::endl; + } + return std::all_of( + std::begin(instance_pass), std::end(instance_pass), [](bool p) { return p; }); } } // namespace profiler diff --git a/profiler/src/profile_softmax.cpp b/profiler/src/profile_softmax.cpp index 622d1c5673..0cf4e2b5d5 100644 --- a/profiler/src/profile_softmax.cpp +++ b/profiler/src/profile_softmax.cpp @@ -8,14 +8,10 @@ #include "profiler/include/profile_softmax_impl.hpp" using ck::index_t; -using ck::profiler::NormDataType; -using ck::profiler::NormType; +using ck::profiler::SoftmaxDataType; struct ArgParser { - std::unordered_map norm_dict = {{"batchnorm", NormType::BATCHNORM}, - {"softmax", NormType::SOFTMAX}}; - std::unordered_map> long_opts = { {"length", {}}, {"stride", {}}, {"reduce", {}}, {"alpha", {}}, {"beta", {}}}; @@ -50,7 +46,7 @@ struct ArgParser void print_help() { - std::cout << "arg1: tensor operation (batchnorm/softmax)\n" + std::cout << "arg1: tensor operation (softmax)\n" << "arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n" << "arg3: verification (0: no; 1: yes)\n" << "arg4: initialization (0: no init; 1: integer value; 2: decimal value)\n" @@ -64,7 +60,7 @@ void print_help() << std::endl; } -int profile_normalization(int argc, char* argv[]) +int profile_softmax(int argc, char* argv[]) { if(argc <= 2) { @@ -75,12 +71,11 @@ int profile_normalization(int argc, char* argv[]) ArgParser arg_parser; // short unnamed options - const NormType norm_type = arg_parser.norm_dict[argv[1]]; - const NormDataType data_type = static_cast(std::stoi(argv[2])); - const bool do_verification = std::stoi(argv[3]); - const int init_method = std::stoi(argv[4]); - const bool do_log = std::stoi(argv[5]); - const bool time_kernel = std::stoi(argv[6]); + const SoftmaxDataType data_type = static_cast(std::stoi(argv[2])); + const bool do_verification = std::stoi(argv[3]); + const int init_method = std::stoi(argv[4]); + const bool do_log = std::stoi(argv[5]); + const bool time_kernel = std::stoi(argv[6]); // parse the long options arg_parser(argc, argv); @@ -91,9 +86,10 @@ int profile_normalization(int argc, char* argv[]) arg_parser.long_opts["alpha"].empty() ? 1 : arg_parser.long_opts["alpha"][0]; const index_t beta = arg_parser.long_opts["beta"].empty() ? 0 : arg_parser.long_opts["beta"][0]; + // Rank 3 if(length.size() == 3) { - if(data_type == NormDataType::F16_F16) + if(data_type == SoftmaxDataType::F16_F16) { ck::profiler::profile_softmax_impl(do_verification, init_method, @@ -103,10 +99,9 @@ int profile_normalization(int argc, char* argv[]) stride, reduce, float(alpha), - float(beta), - norm_type); + float(beta)); } - else if(data_type == NormDataType::F32_F32) + else if(data_type == SoftmaxDataType::F32_F32) { ck::profiler::profile_softmax_impl(do_verification, init_method, @@ -116,17 +111,17 @@ int profile_normalization(int argc, char* argv[]) stride, reduce, float(alpha), - float(beta), - norm_type); + float(beta)); } else { throw std::runtime_error("not implemented yet"); } } + // Rank 4 else if(length.size() == 4) { - if(data_type == NormDataType::F16_F16) + if(data_type == SoftmaxDataType::F16_F16) { ck::profiler::profile_softmax_impl(do_verification, init_method, @@ -136,10 +131,9 @@ int profile_normalization(int argc, char* argv[]) stride, reduce, float(alpha), - float(beta), - norm_type); + float(beta)); } - else if(data_type == NormDataType::F32_F32) + else if(data_type == SoftmaxDataType::F32_F32) { ck::profiler::profile_softmax_impl(do_verification, init_method, @@ -149,8 +143,7 @@ int profile_normalization(int argc, char* argv[]) stride, reduce, float(alpha), - float(beta), - norm_type); + float(beta)); } else { diff --git a/profiler/src/profiler.cpp b/profiler/src/profiler.cpp index a0bbf77955..c647cfe8b8 100644 --- a/profiler/src/profiler.cpp +++ b/profiler/src/profiler.cpp @@ -20,7 +20,7 @@ int profile_conv_fwd_bias_relu_add(int, char*[]); int profile_conv_bwd_data(int, char*[]); int profile_conv_bwd_weight(int, char*[]); int profile_grouped_conv_fwd(int, char*[]); -int profile_normalization(int, char*[]); +int profile_softmax(int, char*[]); int profile_layernorm(int, char*[]); int profile_groupnorm(int, char*[]); int profile_reduce(int, char*[]); @@ -45,6 +45,7 @@ static void print_helper_message() " conv_bwd_data: Convolution Backward Data\n" " conv_bwd_weight: Convolution Backward Weight\n" " grouped_conv_fwd: Grouped Convolution Forward\n" + " softmax: Softmax\n" " reduce: Reduce\n"); // clang-format on } @@ -129,9 +130,9 @@ int main(int argc, char* argv[]) { return profile_reduce(argc, argv); } - else if(strcmp(argv[1], "batchnorm") == 0 || strcmp(argv[1], "softmax") == 0) + else if(strcmp(argv[1], "softmax") == 0) { - return profile_normalization(argc, argv); + return profile_softmax(argc, argv); } else if(strcmp(argv[1], "layernorm") == 0) { diff --git a/test/softmax/CMakeLists.txt b/test/softmax/CMakeLists.txt index a7013eece1..4ba4012625 100644 --- a/test/softmax/CMakeLists.txt +++ b/test/softmax/CMakeLists.txt @@ -1,11 +1,11 @@ add_custom_target(test_softmax) -add_gtest_executable(test_softmax_fp32 test_softmax_fp32.cpp) -add_gtest_executable(test_softmax_fp16 test_softmax_fp16.cpp) -add_gtest_executable(test_softmax_int8 test_softmax_int8.cpp) -target_link_libraries(test_softmax_fp32 PRIVATE utility) -target_link_libraries(test_softmax_fp16 PRIVATE utility) -target_link_libraries(test_softmax_int8 PRIVATE utility) -add_dependencies(test_softmax test_softmax_fp32) -add_dependencies(test_softmax test_softmax_fp16) -add_dependencies(test_softmax test_softmax_int8) +add_gtest_executable(test_softmax_rank3 test_softmax_rank3.cpp) +add_gtest_executable(test_softmax_rank4 test_softmax_rank4.cpp) +add_gtest_executable(test_softmax_interface test_softmax_interface.cpp) +target_link_libraries(test_softmax_rank3 PRIVATE utility device_softmax_instance) +target_link_libraries(test_softmax_rank4 PRIVATE utility device_softmax_instance) +target_link_libraries(test_softmax_interface PRIVATE utility device_softmax_instance) +add_dependencies(test_softmax test_softmax_rank3) +add_dependencies(test_softmax test_softmax_rank4) +add_dependencies(test_softmax test_softmax_interface) diff --git a/test/softmax/test_softmax_fp16.cpp b/test/softmax/test_softmax_fp16.cpp deleted file mode 100644 index cce6a422b6..0000000000 --- a/test/softmax/test_softmax_fp16.cpp +++ /dev/null @@ -1,34 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "gtest/gtest.h" -#include "test_softmax_util.hpp" - -template -using I = ck::Number; - -template -class TestSoftmaxFP16 : public ck::TestSoftmax -{ -}; - -// clang-format off -using KernelTypes = ::testing::Types< -// InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize> - std::tuple, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<4>>, // mixed precision - std::tuple, I<1>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<8>>, - std::tuple, I<1>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<8>, I<8>>, - std::tuple, I<1>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<8>, I<8>>, - std::tuple, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<8>>, - std::tuple, I<1>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<8>, I<8>>, - std::tuple, I<1>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<8>, I<8>>, - std::tuple, I<2>, I<256>, I<8>, I<32>, I<1>, I<8>, I<1>, I<8>, I<8>>, - std::tuple, I<2>, I<256>, I<4>, I<64>, I<1>, I<8>, I<1>, I<8>, I<8>>, - std::tuple, I<2>, I<256>, I<2>, I<128>, I<1>, I<8>, I<1>, I<8>, I<8>>, - std::tuple, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<8>, I<8>>, - std::tuple, I<2>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<8>, I<8>>, - std::tuple, I<2>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<8>, I<8>> - >; -// clang-format on -TYPED_TEST_SUITE(TestSoftmaxFP16, KernelTypes); -TYPED_TEST(TestSoftmaxFP16, Test_FP16) { this->Run(); } diff --git a/test/softmax/test_softmax_fp32.cpp b/test/softmax/test_softmax_fp32.cpp deleted file mode 100644 index 4301a5ae2f..0000000000 --- a/test/softmax/test_softmax_fp32.cpp +++ /dev/null @@ -1,34 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "gtest/gtest.h" -#include "test_softmax_util.hpp" - -template -using I = ck::Number; - -template -class TestSoftmaxFP32 : public ck::TestSoftmax -{ -}; - -// clang-format off -using KernelTypes = ::testing::Types< -// InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize> - std::tuple, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<8>>, // mixed precision - std::tuple, I<1>, I<256>, I<8>, I<32>, I<1>, I<4>, I<1>, I<4>, I<4>>, - std::tuple, I<1>, I<256>, I<4>, I<64>, I<1>, I<4>, I<1>, I<4>, I<4>>, - std::tuple, I<1>, I<256>, I<2>, I<128>, I<1>, I<4>, I<1>, I<4>, I<4>>, - std::tuple, I<1>, I<256>, I<1>, I<256>, I<1>, I<4>, I<1>, I<4>, I<4>>, - std::tuple, I<1>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<4>>, - std::tuple, I<1>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<4>, I<4>>, - std::tuple, I<2>, I<256>, I<8>, I<32>, I<1>, I<4>, I<1>, I<4>, I<4>>, - std::tuple, I<2>, I<256>, I<4>, I<64>, I<1>, I<4>, I<1>, I<4>, I<4>>, - std::tuple, I<2>, I<256>, I<2>, I<128>, I<1>, I<4>, I<1>, I<4>, I<4>>, - std::tuple, I<2>, I<256>, I<1>, I<256>, I<1>, I<4>, I<1>, I<4>, I<4>>, - std::tuple, I<2>, I<256>, I<1>, I<256>, I<1>, I<8>, I<1>, I<4>, I<4>>, - std::tuple, I<2>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<4>, I<4>> - >; -// clang-format on -TYPED_TEST_SUITE(TestSoftmaxFP32, KernelTypes); -TYPED_TEST(TestSoftmaxFP32, Test_FP32) { this->Run(); } diff --git a/test/softmax/test_softmax_int8.cpp b/test/softmax/test_softmax_int8.cpp deleted file mode 100644 index dde165295e..0000000000 --- a/test/softmax/test_softmax_int8.cpp +++ /dev/null @@ -1,30 +0,0 @@ -#include "gtest/gtest.h" -#include "test_softmax_util.hpp" - -template -using I = ck::Number; - -template -class TestSoftmaxINT8 : public ck::TestSoftmax -{ -}; - -// clang-format off -using KernelTypes = ::testing::Types< -// InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize> - std::tuple, I<1>, I<256>, I<8>, I<32>, I<1>, I<16>, I<1>, I<16>, I<16>>, - std::tuple, I<1>, I<256>, I<4>, I<64>, I<1>, I<16>, I<1>, I<16>, I<16>>, - std::tuple, I<1>, I<256>, I<2>, I<128>, I<1>, I<16>, I<1>, I<16>, I<16>>, - std::tuple, I<1>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<16>, I<16>>, - std::tuple, I<1>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<16>, I<16>>, - std::tuple, I<1>, I<256>, I<1>, I<256>, I<1>, I<64>, I<1>, I<16>, I<16>>, - std::tuple, I<2>, I<256>, I<8>, I<32>, I<1>, I<16>, I<1>, I<16>, I<16>>, - std::tuple, I<2>, I<256>, I<4>, I<64>, I<1>, I<16>, I<1>, I<16>, I<16>>, - std::tuple, I<2>, I<256>, I<2>, I<128>, I<1>, I<16>, I<1>, I<16>, I<16>>, - std::tuple, I<2>, I<256>, I<1>, I<256>, I<1>, I<16>, I<1>, I<16>, I<16>>, - std::tuple, I<2>, I<256>, I<1>, I<256>, I<1>, I<32>, I<1>, I<16>, I<16>>, - std::tuple, I<2>, I<256>, I<1>, I<256>, I<1>, I<64>, I<1>, I<16>, I<16>> - >; -// clang-format on -TYPED_TEST_SUITE(TestSoftmaxINT8, KernelTypes); -TYPED_TEST(TestSoftmaxINT8, Test_INT8) { this->Run(); } diff --git a/test/softmax/test_softmax_interface.cpp b/test/softmax/test_softmax_interface.cpp new file mode 100644 index 0000000000..8cac0ba0f5 --- /dev/null +++ b/test/softmax/test_softmax_interface.cpp @@ -0,0 +1,86 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include + +#include "gtest/gtest.h" +#include "test_softmax_util.hpp" + +class TestSoftmaxInterface : public ::testing::Test +{ + protected: + template + using SoftmaxInstance = + ck::DeviceSoftmaxInstanceWrapper; +}; + +TEST_F(TestSoftmaxInterface, IncorrectReduceDims) +{ + std::vector lengths{2, 128, 1536}; + std::vector strides{128 * 1536, 1536, 1}; + + EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported(lengths, strides, {-1})), std::runtime_error); + EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported(lengths, strides, {3})), std::runtime_error); + EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported(lengths, strides, {0, 1})), + std::runtime_error); + EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported(lengths, strides, {})), std::runtime_error); + EXPECT_THROW((SoftmaxInstance<3, 2>{}.IsSupported(lengths, strides, {2, -1})), + std::runtime_error); + EXPECT_THROW((SoftmaxInstance<3, 2>{}.IsSupported(lengths, strides, {2, 4})), + std::runtime_error); + EXPECT_THROW((SoftmaxInstance<3, 2>{}.IsSupported(lengths, strides, {2})), std::runtime_error); +} + +TEST_F(TestSoftmaxInterface, IncorrectLengthsSize) +{ + std::vector lengths{128, 1536}; + std::vector strides{128 * 1536, 1536, 1}; + std::vector reduce_dims{2}; + + EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported({128, 1536}, strides, reduce_dims)), + std::runtime_error); + EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported({}, strides, reduce_dims)), + std::runtime_error); + EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported({1, 8, 128, 1536}, strides, reduce_dims)), + std::runtime_error); +} + +TEST_F(TestSoftmaxInterface, IncorrectStridesSize) +{ + std::vector lengths{2, 128, 1536}; + std::vector reduce_dims{2}; + + EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported(lengths, {1536, 1}, reduce_dims)), + std::runtime_error); + EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported(lengths, {}, reduce_dims)), + std::runtime_error); + EXPECT_THROW((SoftmaxInstance<3, 1>{}.IsSupported(lengths, {1, 2, 3, 4}, reduce_dims)), + std::runtime_error); +} + +TEST_F(TestSoftmaxInterface, UnsupportedLengths) +{ + using SoftmaxInstance1 = ck::DeviceSoftmaxInstanceWrapper<3, 1, 256, 1, 256, 1, 8, 1, 8, 4>; + EXPECT_FALSE(SoftmaxInstance1{}.IsSupported({2, 128, 1500}, {128 * 1500, 1500, 1}, {2})); + EXPECT_FALSE(SoftmaxInstance1{}.IsSupported({2, 127, 1536}, {127 * 1536, 1536, 1}, {2})); + EXPECT_FALSE(SoftmaxInstance1{}.IsSupported({2, 128, 1537}, {128 * 1537, 1537, 1}, {2})); + + // Reduction of middle dimensions + using SoftmaxInstance2 = ck::DeviceSoftmaxInstanceWrapper<3, 3, 256, 8, 32, 8, 8, 0, 8, 4>; + EXPECT_FALSE(SoftmaxInstance2{}.IsSupported({2, 128, 1536}, {128 * 1536, 1536, 1}, {0, 1, 2})); + + // Reduction of middle dimensions + using SoftmaxInstance3 = ck::DeviceSoftmaxInstanceWrapper<3, 1, 256, 8, 32, 8, 8, 0, 4, 8>; + EXPECT_FALSE(SoftmaxInstance3{}.IsSupported({2, 128, 1536}, {128 * 1536, 1536, 1}, {2})); + EXPECT_FALSE(SoftmaxInstance3{}.IsSupported({2, 128, 1537}, {128 * 1537, 1537, 1}, {1})); + EXPECT_FALSE(SoftmaxInstance3{}.IsSupported({2, 128, 1540}, {128 * 1540, 1540, 1}, {1})); + EXPECT_FALSE(SoftmaxInstance3{}.IsSupported({2, 127, 1536}, {127 * 1536, 1536, 1}, {1})); +} + +TEST_F(TestSoftmaxInterface, UnsupportedInstance) +{ + // Instance with InSrcVectorDim = 1, can't reduce middle dims if in/out vec size != 1 + using SoftmaxInstance1 = ck::DeviceSoftmaxInstanceWrapper<3, 1, 256, 8, 32, 1, 8, 1, 8, 8>; + EXPECT_FALSE(SoftmaxInstance1{}.IsSupported({2, 128, 1024}, {128 * 1024, 1024, 1}, {0})); +} diff --git a/test/softmax/test_softmax_rank3.cpp b/test/softmax/test_softmax_rank3.cpp new file mode 100644 index 0000000000..5691ee3f6c --- /dev/null +++ b/test/softmax/test_softmax_rank3.cpp @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "gtest/gtest.h" +#include "test_softmax_util.hpp" + +template +using I = ck::Number; + +using F16 = ck::half_t; +using F32 = float; +using I8 = int8_t; + +template +class TestSoftmax : public ck::TestSoftmax +{ +}; + +// clang-format off +using KernelTypes = ::testing::Types< + // InDataType, AccDataType, OutDataType, Rank + std::tuple< F16, F32, F16, I<3>>, + std::tuple< F32, F32, F32, I<3>>, + std::tuple< I8, F32, I8, I<3>> + >; +// clang-format on + +TYPED_TEST_SUITE(TestSoftmax, KernelTypes); + +#include "test_softmax_ut_cases.inc" diff --git a/test/softmax/test_softmax_rank4.cpp b/test/softmax/test_softmax_rank4.cpp new file mode 100644 index 0000000000..f0b22df25e --- /dev/null +++ b/test/softmax/test_softmax_rank4.cpp @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "gtest/gtest.h" +#include "test_softmax_util.hpp" + +template +using I = ck::Number; + +using F16 = ck::half_t; +using F32 = float; +using I8 = int8_t; + +template +class TestSoftmax : public ck::TestSoftmax +{ +}; + +// clang-format off +using KernelTypes = ::testing::Types< + // InDataType, AccDataType, OutDataType, Rank + std::tuple< F16, F32, F16, I<4>>, + std::tuple< F32, F32, F32, I<4>>, + std::tuple< I8, F32, I8, I<4>> + >; +// clang-format on + +TYPED_TEST_SUITE(TestSoftmax, KernelTypes); + +#include "test_softmax_ut_cases.inc" diff --git a/test/softmax/test_softmax_ut_cases.inc b/test/softmax/test_softmax_ut_cases.inc new file mode 100644 index 0000000000..cf5e4d2d2d --- /dev/null +++ b/test/softmax/test_softmax_ut_cases.inc @@ -0,0 +1,60 @@ +#pragma once + +TYPED_TEST(TestSoftmax, ReduceOutermostDim) +{ + std::vector reduce_dims{this->Rank - 1}; + this->Run(reduce_dims); +} + +TYPED_TEST(TestSoftmax, ReduceMiddleDim) +{ + for(int dim = 0; dim < this->Rank - 1; ++dim) + { + std::vector reduce_dims{dim}; + this->Run(reduce_dims); + } +} + +TYPED_TEST(TestSoftmax, ReduceMultipleDimsWithOutermost) +{ + for(int dim = 0; dim < this->Rank - 1; ++dim) + { + std::vector reduce_dims{dim, this->Rank - 1}; + this->Run(reduce_dims); + } +} + +TYPED_TEST(TestSoftmax, ReduceMultipleMiddleDims) +{ + std::vector reduce_dims{0, 1}; + if(this->Rank >= 3) + { + this->Run(reduce_dims); + } + + if(this->Rank >= 4) + { + reduce_dims = std::vector{0, 2}; + this->Run(reduce_dims); + reduce_dims = std::vector{0, 1, 2}; + this->Run(reduce_dims); + } +} + +TYPED_TEST(TestSoftmax, ReduceAllDims) +{ + std::vector reduce_dims(this->Rank); + std::iota(std::begin(reduce_dims), std::end(reduce_dims), 0); + this->Run(reduce_dims); +} + +TYPED_TEST(TestSoftmax, ReduceOddLengths) +{ + this->in_lengths_ = {{3, 63, 1032}}; + if(this->Rank >= 4) + { + this->in_lengths_ = {{1, 3, 63, 1032}}; + } + this->Run({this->Rank - 1}); + this->Run({this->Rank - 2}); +} diff --git a/test/softmax/test_softmax_util.hpp b/test/softmax/test_softmax_util.hpp index c41d326222..23ac3d20e2 100644 --- a/test/softmax/test_softmax_util.hpp +++ b/test/softmax/test_softmax_util.hpp @@ -3,19 +3,17 @@ #pragma once +#include +#include +#include #include -#include #include #include "ck/ck.hpp" -#include "ck/utility/number.hpp" #include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" - -#include "ck/library/utility/check_err.hpp" -#include "ck/library/utility/host_tensor.hpp" -#include "ck/library/utility/device_memory.hpp" -#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp" +#include "include/ck/utility/data_type.hpp" +#include "profiler/include/profile_softmax_impl.hpp" namespace ck { @@ -35,126 +33,110 @@ template class TestSoftmax : public ::testing::Test { protected: - using InDataType = std::tuple_element_t<0, Tuple>; - using AccDataType = std::tuple_element_t<1, Tuple>; - using OutDataType = std::tuple_element_t<2, Tuple>; - static constexpr index_t Rank = std::tuple_element_t<3, Tuple>{}.value; - static constexpr index_t NumReduceDim = std::tuple_element_t<4, Tuple>{}.value; - static constexpr index_t BlockSize = std::tuple_element_t<5, Tuple>{}.value; - static constexpr index_t MThreadClusterSize = std::tuple_element_t<6, Tuple>{}.value; - static constexpr index_t KThreadClusterSize = std::tuple_element_t<7, Tuple>{}.value; - static constexpr index_t MThreadSliceSize = std::tuple_element_t<8, Tuple>{}.value; - static constexpr index_t KThreadSliceSize = std::tuple_element_t<9, Tuple>{}.value; - static constexpr index_t InSrcVectorDim = std::tuple_element_t<10, Tuple>{}.value; - static constexpr index_t InSrcVectorSize = std::tuple_element_t<11, Tuple>{}.value; - static constexpr index_t OutDstVectorSize = std::tuple_element_t<12, Tuple>{}.value; + using InDataType = std::tuple_element_t<0, Tuple>; + using AccDataType = std::tuple_element_t<1, Tuple>; + using OutDataType = std::tuple_element_t<2, Tuple>; + static constexpr index_t Rank = std::tuple_element_t<3, Tuple>{}.value; - using ReferenceInstance = - tensor_operation::host::ReferenceSoftmax; + public: + std::vector> in_lengths_ = {{2, 128, 1024}, {4, 16, 8448}, {128, 128, 64}}; + std::vector> scales_ = {{2, 0}, {0, 2}, {2, 2}}; + bool bench_ = false; // measure kernel performance + bool verify_ = true; - using PassThrough = ck::tensor_operation::element_wise::PassThrough; - - using DeviceInstance = tensor_operation::device::DeviceSoftmaxImpl; - - TestSoftmax() : ref_instance_invoker_(ReferenceInstance{}.MakeInvoker()) {} - - void RunSingle(std::vector in_length, AccDataType alpha, AccDataType beta) + void SetUp() override { - std::vector reduce_dims(NumReduceDim); - std::iota(reduce_dims.begin(), reduce_dims.end(), Rank - NumReduceDim); - - Tensor in(in_length); - Tensor out(in_length); - - in.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - out.GenerateTensorValue(GeneratorTensor_2{-5, 5}); - - Tensor out_ref(out); - - DeviceMem in_dev(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); - DeviceMem out_dev(sizeof(OutDataType) * out.mDesc.GetElementSpaceSize()); - in_dev.ToDevice(in.mData.data()); - out_dev.ToDevice(out.mData.data()); - - std::vector i_in_lengths(in.mDesc.GetLengths().begin(), - in.mDesc.GetLengths().end()); - std::vector i_in_strides(in.mDesc.GetStrides().begin(), - in.mDesc.GetStrides().end()); - - auto device_instance = DeviceInstance{}; - auto argument_ptr = device_instance.MakeArgumentPointer(i_in_lengths, - i_in_strides, - reduce_dims, - &alpha, - &beta, - in_dev.GetDeviceBuffer(), - out_dev.GetDeviceBuffer(), - PassThrough{}, - PassThrough{}); - - if(!device_instance.IsSupportedArgument(argument_ptr.get())) + if constexpr(Rank == 4) { - // std::cout << "Skipped due to unsupported argument: " - // << "input lengths = [" << serialize_range(in_length) << "], " - // << "scaler = [" << alpha << ", " << beta << "]." << std::endl; - return; - } - - auto invoker_ptr = device_instance.MakeInvokerPointer(); - invoker_ptr->Run(argument_ptr.get()); - - ref_instance_invoker_.Run({in, out_ref, alpha, beta, reduce_dims}); - - out_dev.FromDevice(out.mData.data()); - - bool pass; - - if(std::is_same::value) - { - EXPECT_TRUE(pass = ck::utils::check_err( - out.mData, out_ref.mData, "Error: Incorrect results!", 0, 1)); - } - else - { - EXPECT_TRUE(pass = ck::utils::check_err(out.mData, out_ref.mData)); - } - - if(!pass) - { - FAIL() << "Failure in input lengths = [" << serialize_range(in_length) << "], " - << "scaler = [" << alpha << ", " << beta << "]."; + in_lengths_ = std::vector>{ + {1, 2, 128, 1024}, {2, 4, 16, 8448}, {1, 128, 128, 64}}; } } - void Run() + void RunSingle(std::vector in_length, + std::vector reduce_dims, + AccDataType alpha, + AccDataType beta) { + int init_method = 1; // integer value initialization + bool log = false; + std::vector strides; // intenionally empty, to get packed layout. + bool pass = ck::profiler::profile_softmax_impl( + verify_, init_method, log, bench_, in_length, strides, reduce_dims, alpha, beta); + EXPECT_TRUE(pass); + } + + void Run(std::vector reduce_dims = {}) + { + if(reduce_dims.empty()) + { + reduce_dims.push_back(Rank - 1); + } + for(auto in_length : this->in_lengths_) { for(auto scale : this->scales_) { - this->RunSingle(in_length, scale[0], scale[1]); + this->RunSingle(in_length, reduce_dims, scale[0], scale[1]); } } } - - std::vector> in_lengths_ = { - {1, 8, 128}, {2, 128, 1024}, {3, 9, 1032}, {4, 4, 2048}, {8, 1, 8192}}; - std::vector> scales_ = {{1, 0}, {1, 1}, {0, 1}, {2, 2}}; - - typename ReferenceInstance::Invoker ref_instance_invoker_; }; + +template +struct DeviceSoftmaxInstanceWrapper +{ + using F16 = half_t; + using F32 = float; + using Pass = tensor_operation::element_wise::PassThrough; + + using InDataType = F16; + using AccDataType = F32; + using OutDataType = F16; + using InElementOp = Pass; + using AccElementOp = Pass; + + using DeviceSoftmaxInstance = tensor_operation::device::DeviceSoftmaxImpl; + + bool IsSupported(const std::vector in_lengths, + const std::vector in_strides, + const std::vector reduce_dims) const + { + auto softmax = DeviceSoftmaxInstance{}; + auto argument = softmax.MakeArgument(in_lengths, + in_strides, + reduce_dims, + 1, // alpha + 1, // beta + nullptr, // in_dev + nullptr, // in_out + Pass{}, // in elementwise op + Pass{}); // acc elementwise op + return softmax.IsSupportedArgument(argument); + } +}; + } // namespace ck