mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-18 20:09:25 +00:00
Padded Generic Kernel Instance (#730)
* Add NumReduceDim template parameter to DeviceSoftmax and Softmax client API to simplify instances collecting * Move the generic kernel instance to be the first of the instance list for elementwise op of normalization * Add GetGenericInstance() interface for DeviceOperationInstanceFactory class of DeviceSoftmax * Add testing of GetGenericInstance() in client_example of Softmax * Revert "Add testing of GetGenericInstance() in client_example of Softmax" This reverts commitf629cd9a93. * Revert "Add GetGenericInstance() interface for DeviceOperationInstanceFactory class of DeviceSoftmax" This reverts commita9f0d000eb. * Support generic kernel instance to be the first instance returned by GetInstances() for GroupNorm * Move generic kernel instance to separate tuple for elementwise op of normalization * Remove un-used files for softmax instance * Store generic kernel instance to separate tuple for softmax * Add IsSupported checking for generic instance to client example of softmax * Replace the get_device_normalize_from_mean_meansquare_instances() by the DeviceOperationInstanceFactory class for elementwise-normalization * clang-format fix * Remove int8 from softmax instances --------- Co-authored-by: zjing14 <zhangjing14@gmail.com> [ROCm/composable_kernel commit:0d9118226b]
This commit is contained in:
@@ -5,11 +5,10 @@
|
||||
|
||||
#include <vector>
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_elementwise_impl.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
@@ -29,20 +28,34 @@ template <typename InputType,
|
||||
typename GammaDataType,
|
||||
typename BetaDataType,
|
||||
typename OutputType>
|
||||
auto get_device_normalize_from_mean_meansquare_instances()
|
||||
struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceElementwise<
|
||||
ck::Tuple<InputType, MeanType, MeanSquareType, GammaDataType, BetaDataType>,
|
||||
ck::Tuple<OutputType>,
|
||||
Normalize,
|
||||
2>>
|
||||
{
|
||||
std::vector<DeviceNormalizeFromMeanMeanSquarePtr> op_ptrs;
|
||||
using DeviceOp = DeviceElementwise<
|
||||
ck::Tuple<InputType, MeanType, MeanSquareType, GammaDataType, BetaDataType>,
|
||||
ck::Tuple<OutputType>,
|
||||
Normalize,
|
||||
2>;
|
||||
|
||||
if constexpr(is_same<InputType, half_t>::value && is_same<MeanType, float>::value &&
|
||||
is_same<MeanSquareType, float>::value && is_same<GammaDataType, half_t>::value &&
|
||||
is_same<BetaDataType, half_t>::value && is_same<OutputType, half_t>::value)
|
||||
static auto GetInstances()
|
||||
{
|
||||
ck::tensor_operation::device::instance::
|
||||
add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(op_ptrs);
|
||||
}
|
||||
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
|
||||
|
||||
return op_ptrs;
|
||||
}
|
||||
if constexpr(is_same<InputType, half_t>::value && is_same<MeanType, float>::value &&
|
||||
is_same<MeanSquareType, float>::value &&
|
||||
is_same<GammaDataType, half_t>::value &&
|
||||
is_same<BetaDataType, half_t>::value && is_same<OutputType, half_t>::value)
|
||||
{
|
||||
ck::tensor_operation::device::instance::
|
||||
add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(op_ptrs);
|
||||
}
|
||||
|
||||
return op_ptrs;
|
||||
};
|
||||
};
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -9,34 +9,33 @@
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f16_f16_rank3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>&);
|
||||
void add_device_softmax_f16_f16_rank4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>&);
|
||||
|
||||
void add_device_softmax_f32_f32_rank3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>&);
|
||||
void add_device_softmax_f32_f32_rank4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>&);
|
||||
|
||||
void add_device_softmax_i8_i8_rank3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>&);
|
||||
void add_device_softmax_i8_i8_rank4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>&);
|
||||
|
||||
template <typename InDataType, typename AccDataType, typename OutDataType, index_t Rank>
|
||||
struct DeviceOperationInstanceFactory<
|
||||
ck::tensor_operation::device::
|
||||
DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>>
|
||||
template <typename InDataType,
|
||||
typename AccDataType,
|
||||
typename OutDataType,
|
||||
index_t Rank,
|
||||
index_t NumReduceDim>
|
||||
struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceSoftmax<InDataType,
|
||||
AccDataType,
|
||||
OutDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Rank,
|
||||
NumReduceDim>>
|
||||
{
|
||||
using DeviceOp =
|
||||
DeviceSoftmax<InDataType, AccDataType, OutDataType, PassThrough, PassThrough, Rank>;
|
||||
using DeviceOp = DeviceSoftmax<InDataType,
|
||||
AccDataType,
|
||||
OutDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
Rank,
|
||||
NumReduceDim>;
|
||||
|
||||
static auto GetInstances()
|
||||
{
|
||||
@@ -46,25 +45,49 @@ struct DeviceOperationInstanceFactory<
|
||||
std::is_same_v<OutDataType, F16>)
|
||||
{
|
||||
if constexpr(Rank == 3)
|
||||
add_device_softmax_f16_f16_rank3_instances(op_ptrs);
|
||||
{
|
||||
if constexpr(NumReduceDim == 1)
|
||||
add_device_softmax_f16_f16_rank3_reduce1_instances(op_ptrs);
|
||||
else if constexpr(NumReduceDim == 2)
|
||||
add_device_softmax_f16_f16_rank3_reduce2_instances(op_ptrs);
|
||||
else if constexpr(NumReduceDim == 3)
|
||||
add_device_softmax_f16_f16_rank3_reduce3_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(Rank == 4)
|
||||
add_device_softmax_f16_f16_rank4_instances(op_ptrs);
|
||||
{
|
||||
if constexpr(NumReduceDim == 1)
|
||||
add_device_softmax_f16_f16_rank4_reduce1_instances(op_ptrs);
|
||||
else if constexpr(NumReduceDim == 2)
|
||||
add_device_softmax_f16_f16_rank4_reduce2_instances(op_ptrs);
|
||||
else if constexpr(NumReduceDim == 3)
|
||||
add_device_softmax_f16_f16_rank4_reduce3_instances(op_ptrs);
|
||||
else if constexpr(NumReduceDim == 4)
|
||||
add_device_softmax_f16_f16_rank4_reduce4_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
else if constexpr(std::is_same_v<InDataType, F32> && std::is_same_v<AccDataType, F32> &&
|
||||
std::is_same_v<OutDataType, F32>)
|
||||
{
|
||||
if constexpr(Rank == 3)
|
||||
add_device_softmax_f32_f32_rank3_instances(op_ptrs);
|
||||
{
|
||||
if constexpr(NumReduceDim == 1)
|
||||
add_device_softmax_f32_f32_rank3_reduce1_instances(op_ptrs);
|
||||
else if constexpr(NumReduceDim == 2)
|
||||
add_device_softmax_f32_f32_rank3_reduce2_instances(op_ptrs);
|
||||
else if constexpr(NumReduceDim == 3)
|
||||
add_device_softmax_f32_f32_rank3_reduce3_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(Rank == 4)
|
||||
add_device_softmax_f32_f32_rank4_instances(op_ptrs);
|
||||
}
|
||||
else if constexpr(std::is_same_v<InDataType, I8> && std::is_same_v<AccDataType, F32> &&
|
||||
std::is_same_v<OutDataType, I8>)
|
||||
{
|
||||
if constexpr(Rank == 3)
|
||||
add_device_softmax_i8_i8_rank3_instances(op_ptrs);
|
||||
else if constexpr(Rank == 4)
|
||||
add_device_softmax_i8_i8_rank4_instances(op_ptrs);
|
||||
{
|
||||
if constexpr(NumReduceDim == 1)
|
||||
add_device_softmax_f32_f32_rank4_reduce1_instances(op_ptrs);
|
||||
else if constexpr(NumReduceDim == 2)
|
||||
add_device_softmax_f32_f32_rank4_reduce2_instances(op_ptrs);
|
||||
else if constexpr(NumReduceDim == 3)
|
||||
add_device_softmax_f32_f32_rank4_reduce3_instances(op_ptrs);
|
||||
else if constexpr(NumReduceDim == 4)
|
||||
add_device_softmax_f32_f32_rank4_reduce4_instances(op_ptrs);
|
||||
}
|
||||
}
|
||||
|
||||
return op_ptrs;
|
||||
|
||||
@@ -1,22 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f16_f16_rank3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
|
||||
void add_device_softmax_f16_f16_rank4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -14,7 +14,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f16_f16_rank3_reduce1_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 1>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -14,7 +14,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f16_f16_rank3_reduce2_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 2>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -14,7 +14,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f16_f16_rank3_reduce3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 3>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -14,7 +14,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f16_f16_rank4_reduce1_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 1>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -14,7 +14,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f16_f16_rank4_reduce2_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 2>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -14,7 +14,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f16_f16_rank4_reduce3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 3>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -14,7 +14,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f16_f16_rank4_reduce4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 4>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -16,7 +16,6 @@ template <index_t Rank, index_t Reduce>
|
||||
using device_softmax_f16_f16_instances = std::tuple<
|
||||
// clang-format off
|
||||
// InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
|
||||
// fallback kernel
|
||||
DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>,
|
||||
DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 8, 1, 8, 8>,
|
||||
DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 256, 4, 64, 1, 8, 1, 8, 8>,
|
||||
@@ -33,6 +32,13 @@ using device_softmax_f16_f16_instances = std::tuple<
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
template <index_t Rank, index_t Reduce>
|
||||
using device_softmax_f16_f16_generic_instance = std::tuple<
|
||||
// clang-format off
|
||||
DeviceSoftmaxImpl< F16, F32, F16, PassThrough, PassThrough, Rank, Reduce, 64, 8, 8, 1, 1, 1, 1, 1>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
@@ -1,22 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f32_f32_rank3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);
|
||||
void add_device_softmax_f32_f32_rank4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -14,7 +14,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f32_f32_rank3_reduce1_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 1>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -14,7 +14,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f32_f32_rank3_reduce2_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 2>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -14,7 +14,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f32_f32_rank3_reduce3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances);
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 3>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -14,7 +14,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f32_f32_rank4_reduce1_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 1>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -14,7 +14,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f32_f32_rank4_reduce2_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 2>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -14,7 +14,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f32_f32_rank4_reduce3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 3>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -14,7 +14,7 @@ namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f32_f32_rank4_reduce4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances);
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 4>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
|
||||
@@ -16,7 +16,7 @@ template <index_t Rank, index_t Reduce>
|
||||
using device_softmax_f32_f32_instances = std::tuple<
|
||||
// clang-format off
|
||||
// InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
|
||||
DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, // fallback kernel
|
||||
DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>,
|
||||
DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 4>,
|
||||
DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 4, 64, 1, 8, 1, 4, 4>,
|
||||
DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 8, 1, 4, 4>,
|
||||
@@ -32,6 +32,13 @@ using device_softmax_f32_f32_instances = std::tuple<
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
template <index_t Rank, index_t Reduce>
|
||||
using device_softmax_f32_f32_generic_instance = std::tuple<
|
||||
// clang-format off
|
||||
DeviceSoftmaxImpl< F32, F32, F32, PassThrough, PassThrough, Rank, Reduce, 64, 8, 8, 1, 1, 1, 1, 1>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
@@ -1,22 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_i8_i8_rank3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances);
|
||||
void add_device_softmax_i8_i8_rank4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,22 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_i8_i8_rank3_reduce1_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,22 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_i8_i8_rank3_reduce2_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,22 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_i8_i8_rank3_reduce3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,22 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_i8_i8_rank4_reduce1_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,22 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_i8_i8_rank4_reduce2_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,22 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_i8_i8_rank4_reduce3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,22 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_i8_i8_rank4_reduce4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances);
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,40 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <tuple>
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
template <index_t Rank, index_t Reduce>
|
||||
using device_softmax_i8_i8_instances = std::tuple<
|
||||
// clang-format off
|
||||
// InDataType, AccDataType, OutDataType, InElementwiseOp, AccElementwiseOp, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
|
||||
// fallback kernel
|
||||
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 16, 1, 1, 1>,
|
||||
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 1, 16, 1, 16, 16>,
|
||||
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 4, 64, 1, 16, 1, 16, 16>,
|
||||
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 16, 1, 16, 16>,
|
||||
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 32, 1, 16, 16>,
|
||||
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 2, 128, 1, 64, 1, 16, 16>,
|
||||
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 16, 1, 16, 16>,
|
||||
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 32, 1, 16, 16>,
|
||||
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 1, 256, 1, 64, 1, 16, 16>,
|
||||
// Reduction on middle dimensions
|
||||
// InSrcVectorDim is 0 since we want to coalesce reads on M dimension
|
||||
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 8, 32, 8, 8, 0, 1, 1>,
|
||||
DeviceSoftmaxImpl< I8, F32, I8, PassThrough, PassThrough, Rank, Reduce, 256, 32, 8, 32, 8, 0, 16, 8>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -3,6 +3,17 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp"
|
||||
|
||||
Reference in New Issue
Block a user