mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-16 10:59:55 +00:00
Padded Generic Kernel Instance (#730)
* Add NumReduceDim template parameter to DeviceSoftmax and Softmax client API to simplify instance collection * Move the generic kernel instance to be the first of the instance list for elementwise op of normalization * Add GetGenericInstance() interface for DeviceOperationInstanceFactory class of DeviceSoftmax * Add testing of GetGenericInstance() in client_example of Softmax * Revert "Add testing of GetGenericInstance() in client_example of Softmax" This reverts commit f629cd9a93. * Revert "Add GetGenericInstance() interface for DeviceOperationInstanceFactory class of DeviceSoftmax" This reverts commit a9f0d000eb. * Support generic kernel instance to be the first instance returned by GetInstances() for GroupNorm * Move generic kernel instance to separate tuple for elementwise op of normalization * Remove unused files for softmax instance * Store generic kernel instance to separate tuple for softmax * Add IsSupported checking for generic instance to client example of softmax * Replace the get_device_normalize_from_mean_meansquare_instances() by the DeviceOperationInstanceFactory class for elementwise-normalization * clang-format fix * Remove int8 from softmax instances --------- Co-authored-by: zjing14 <zhangjing14@gmail.com> [ROCm/composable_kernel commit:0d9118226b]
This commit is contained in:
@@ -30,7 +30,12 @@ using device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances = std:
|
||||
//###################|<in, mean, square_mean, gamma, beta>| <out>| functor| NDim| MPerThread| <in, mean, square_mean, gamma, beta ScalarPerVector>| <out ScalarPerVector>|
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 8, Sequence<8, 1, 1, 8, 8>, Sequence<8> >,
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 4, Sequence<4, 1, 1, 4, 4>, Sequence<4> >,
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 2, Sequence<2, 1, 1, 2, 2>, Sequence<2> >,
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 2, Sequence<2, 1, 1, 2, 2>, Sequence<2> >
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
using device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_generic_instance = std::tuple<
|
||||
// clang-format off
|
||||
DeviceElementwiseImpl<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2, 1, Sequence<1, 1, 1, 1, 1>, Sequence<1> >
|
||||
// clang-format on
|
||||
>;
|
||||
@@ -39,6 +44,9 @@ void add_device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances(
|
||||
std::vector<DeviceElementwisePtr<Tuple<F16, F32, F32, F16, F16>, Tuple<F16>, Normalize, 2>>&
|
||||
instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances, device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_generic_instance{});
|
||||
|
||||
add_device_operation_instances(
|
||||
instances, device_normalize_from_mean_squaremean_f16_f32_f32_f16_f16_instances{});
|
||||
}
|
||||
|
||||
@@ -14,6 +14,8 @@ void add_device_normalization_rank_5_3_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 5, 3>>>&
|
||||
instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_normalization_f16_generic_instance<Pass, 5, 3>{});
|
||||
add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 5, 3>{});
|
||||
}
|
||||
|
||||
|
||||
@@ -14,6 +14,8 @@ void add_device_normalization_rank_5_3_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 5, 3>>>&
|
||||
instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_normalization_f32_generic_instance<Pass, 5, 3>{});
|
||||
add_device_operation_instances(instances, device_normalization_f32_instances<Pass, 5, 3>{});
|
||||
}
|
||||
|
||||
|
||||
@@ -14,6 +14,8 @@ void add_device_normalization_rank_5_3_swish_f16_f32_f32_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceNormalization<F16, F32, F32, F32, F16, Swish, 5, 3>>>&
|
||||
instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances, device_normalization_f16_f32_f32_f16_generic_instance<Swish, 5, 3>{});
|
||||
add_device_operation_instances(instances,
|
||||
device_normalization_f16_f32_f32_f16_instances<Swish, 5, 3>{});
|
||||
}
|
||||
|
||||
@@ -14,6 +14,8 @@ void add_device_normalization_rank_5_3_swish_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Swish, 5, 3>>>&
|
||||
instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_normalization_f16_generic_instance<Swish, 5, 3>{});
|
||||
add_device_operation_instances(instances, device_normalization_f16_instances<Swish, 5, 3>{});
|
||||
}
|
||||
|
||||
|
||||
@@ -14,6 +14,8 @@ void add_device_normalization_rank_5_3_swish_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Swish, 5, 3>>>&
|
||||
instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_normalization_f32_generic_instance<Swish, 5, 3>{});
|
||||
add_device_operation_instances(instances, device_normalization_f32_instances<Swish, 5, 3>{});
|
||||
}
|
||||
|
||||
|
||||
@@ -14,6 +14,8 @@ void add_device_normalization_rank_2_1_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 2, 1>>>&
|
||||
instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_normalization_f16_generic_instance<Pass, 2, 1>{});
|
||||
add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 2, 1>{});
|
||||
}
|
||||
|
||||
|
||||
@@ -14,6 +14,8 @@ void add_device_normalization_rank_2_1_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 2, 1>>>&
|
||||
instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_normalization_f32_generic_instance<Pass, 2, 1>{});
|
||||
add_device_operation_instances(instances, device_normalization_f32_instances<Pass, 2, 1>{});
|
||||
}
|
||||
|
||||
|
||||
@@ -14,6 +14,8 @@ void add_device_normalization_rank_4_3_f16_instances(
|
||||
std::vector<std::unique_ptr<DeviceNormalization<F16, F16, F16, F32, F16, Pass, 4, 3>>>&
|
||||
instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_normalization_f16_generic_instance<Pass, 4, 3>{});
|
||||
add_device_operation_instances(instances, device_normalization_f16_instances<Pass, 4, 3>{});
|
||||
}
|
||||
|
||||
|
||||
@@ -14,6 +14,8 @@ void add_device_normalization_rank_4_3_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceNormalization<F32, F32, F32, F32, F32, Pass, 4, 3>>>&
|
||||
instances)
|
||||
{
|
||||
add_device_operation_instances(instances,
|
||||
device_normalization_f32_generic_instance<Pass, 4, 3>{});
|
||||
add_device_operation_instances(instances, device_normalization_f32_instances<Pass, 4, 3>{});
|
||||
}
|
||||
|
||||
|
||||
@@ -43,6 +43,13 @@ using device_normalization_f16_instances =
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
template <typename OutElementwise, index_t Rank, index_t Reduce>
|
||||
using device_normalization_f16_generic_instance = std::tuple<
|
||||
// clang-format off
|
||||
DeviceNormalizationImpl<F16, F16, F16, F32, F16, OutElementwise, Rank, Reduce, 64, 1, 64, 1, 1, 1, 1, 1, 1, 1, 1, 1>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
template <typename OutElementwise, index_t Rank, index_t Reduce>
|
||||
using device_normalization_f32_instances = std::tuple<
|
||||
// clang-format off
|
||||
@@ -69,6 +76,13 @@ using device_normalization_f32_instances = std::tuple<
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
template <typename OutElementwise, index_t Rank, index_t Reduce>
|
||||
using device_normalization_f32_generic_instance = std::tuple<
|
||||
// clang-format off
|
||||
DeviceNormalizationImpl<F32, F32, F32, F32, F32, OutElementwise, Rank, Reduce, 64, 1, 64, 1, 1, 1, 1, 1, 1, 1, 1, 1>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
template <typename OutElementwise, index_t Rank, index_t Reduce>
|
||||
using device_normalization_f16_f32_f32_f16_instances = std::tuple<
|
||||
// clang-format off
|
||||
@@ -95,6 +109,13 @@ using device_normalization_f16_f32_f32_f16_instances = std::tuple<
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
template <typename OutElementwise, index_t Rank, index_t Reduce>
|
||||
using device_normalization_f16_f32_f32_f16_generic_instance = std::tuple<
|
||||
// clang-format off
|
||||
DeviceNormalizationImpl<F16, F32, F32, F32, F16, OutElementwise, Rank, Reduce, 64, 1, 64, 1, 1, 1, 1, 1, 1, 1, 1, 1>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
@@ -1,13 +1,4 @@
|
||||
add_instance_library(device_softmax_instance
|
||||
device_softmax_i8_i8_instance.cpp
|
||||
device_softmax_i8_i8_instance_rank3_reduce1.cpp
|
||||
device_softmax_i8_i8_instance_rank3_reduce2.cpp
|
||||
device_softmax_i8_i8_instance_rank3_reduce3.cpp
|
||||
device_softmax_i8_i8_instance_rank4_reduce1.cpp
|
||||
device_softmax_i8_i8_instance_rank4_reduce2.cpp
|
||||
device_softmax_i8_i8_instance_rank4_reduce3.cpp
|
||||
device_softmax_i8_i8_instance_rank4_reduce4.cpp
|
||||
device_softmax_f16_f16_instance.cpp
|
||||
device_softmax_f16_f16_instance_rank3_reduce1.cpp
|
||||
device_softmax_f16_f16_instance_rank3_reduce2.cpp
|
||||
device_softmax_f16_f16_instance_rank3_reduce3.cpp
|
||||
@@ -15,7 +6,6 @@ add_instance_library(device_softmax_instance
|
||||
device_softmax_f16_f16_instance_rank4_reduce2.cpp
|
||||
device_softmax_f16_f16_instance_rank4_reduce3.cpp
|
||||
device_softmax_f16_f16_instance_rank4_reduce4.cpp
|
||||
device_softmax_f32_f32_instance.cpp
|
||||
device_softmax_f32_f32_instance_rank3_reduce1.cpp
|
||||
device_softmax_f32_f32_instance_rank3_reduce2.cpp
|
||||
device_softmax_f32_f32_instance_rank3_reduce3.cpp
|
||||
|
||||
@@ -1,40 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce1.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce2.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank3_reduce3.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce1.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce2.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce3.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f16_f16_instance_rank4_reduce4.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f16_f16_rank3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3>>& instances)
|
||||
{
|
||||
add_device_softmax_f16_f16_rank3_reduce1_instances(instances);
|
||||
add_device_softmax_f16_f16_rank3_reduce2_instances(instances);
|
||||
add_device_softmax_f16_f16_rank3_reduce3_instances(instances);
|
||||
}
|
||||
|
||||
void add_device_softmax_f16_f16_rank4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4>>& instances)
|
||||
{
|
||||
add_device_softmax_f16_f16_rank4_reduce1_instances(instances);
|
||||
add_device_softmax_f16_f16_rank4_reduce2_instances(instances);
|
||||
add_device_softmax_f16_f16_rank4_reduce3_instances(instances);
|
||||
add_device_softmax_f16_f16_rank4_reduce4_instances(instances);
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -13,12 +13,11 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 3;
|
||||
|
||||
void add_device_softmax_f16_f16_rank3_reduce1_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 1>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 1>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<3, 1>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 1>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
|
||||
@@ -13,12 +13,11 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 3;
|
||||
|
||||
void add_device_softmax_f16_f16_rank3_reduce2_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 2>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 2>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<3, 2>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 2>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
|
||||
@@ -13,12 +13,11 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 3;
|
||||
|
||||
void add_device_softmax_f16_f16_rank3_reduce3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 3, 3>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 3>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<3, 3>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 3>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
|
||||
@@ -13,12 +13,11 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 4;
|
||||
|
||||
void add_device_softmax_f16_f16_rank4_reduce1_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 1>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 1>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<4, 1>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 1>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
|
||||
@@ -13,12 +13,11 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 4;
|
||||
|
||||
void add_device_softmax_f16_f16_rank4_reduce2_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 2>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 2>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<4, 2>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 2>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
|
||||
@@ -13,12 +13,11 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 4;
|
||||
|
||||
void add_device_softmax_f16_f16_rank4_reduce3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 3>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 3>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<4, 3>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 3>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
|
||||
@@ -13,12 +13,11 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 4;
|
||||
|
||||
void add_device_softmax_f16_f16_rank4_reduce4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, RANK>>& instances)
|
||||
std::vector<DeviceSoftmaxPtr<F16, F32, F16, PassThrough, PassThrough, 4, 4>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<RANK, 4>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_generic_instance<4, 4>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 4>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
|
||||
@@ -1,40 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce1.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce2.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank3_reduce3.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce1.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce2.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce3.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_f32_f32_instance_rank4_reduce4.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_f32_f32_rank3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3>>& instances)
|
||||
{
|
||||
add_device_softmax_f32_f32_rank3_reduce1_instances(instances);
|
||||
add_device_softmax_f32_f32_rank3_reduce2_instances(instances);
|
||||
add_device_softmax_f32_f32_rank3_reduce3_instances(instances);
|
||||
}
|
||||
|
||||
void add_device_softmax_f32_f32_rank4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4>>& instances)
|
||||
{
|
||||
add_device_softmax_f32_f32_rank4_reduce1_instances(instances);
|
||||
add_device_softmax_f32_f32_rank4_reduce2_instances(instances);
|
||||
add_device_softmax_f32_f32_rank4_reduce3_instances(instances);
|
||||
add_device_softmax_f32_f32_rank4_reduce4_instances(instances);
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -13,12 +13,11 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 3;
|
||||
|
||||
void add_device_softmax_f32_f32_rank3_reduce1_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 1>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 1>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<3, 1>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 1>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
|
||||
@@ -13,12 +13,11 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 3;
|
||||
|
||||
void add_device_softmax_f32_f32_rank3_reduce2_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 2>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 2>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<3, 2>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 2>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
|
||||
@@ -13,12 +13,11 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 3;
|
||||
|
||||
void add_device_softmax_f32_f32_rank3_reduce3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 3, 3>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 3>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<3, 3>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 3>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
|
||||
@@ -13,12 +13,11 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 4;
|
||||
|
||||
void add_device_softmax_f32_f32_rank4_reduce1_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 1>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 1>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<4, 1>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 1>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
|
||||
@@ -13,12 +13,11 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 4;
|
||||
|
||||
void add_device_softmax_f32_f32_rank4_reduce2_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 2>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 2>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<4, 2>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 2>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
|
||||
@@ -13,12 +13,11 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 4;
|
||||
|
||||
void add_device_softmax_f32_f32_rank4_reduce3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 3>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 3>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<4, 3>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 3>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
|
||||
@@ -13,12 +13,11 @@ namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 4;
|
||||
|
||||
void add_device_softmax_f32_f32_rank4_reduce4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, RANK>>& instances)
|
||||
std::vector<DeviceSoftmaxPtr<F32, F32, F32, PassThrough, PassThrough, 4, 4>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<RANK, 4>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_generic_instance<4, 4>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 4>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
|
||||
@@ -1,40 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp"
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_softmax_i8_i8_rank3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 3>>& instances)
|
||||
{
|
||||
add_device_softmax_i8_i8_rank3_reduce1_instances(instances);
|
||||
add_device_softmax_i8_i8_rank3_reduce2_instances(instances);
|
||||
add_device_softmax_i8_i8_rank3_reduce3_instances(instances);
|
||||
}
|
||||
|
||||
void add_device_softmax_i8_i8_rank4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, 4>>& instances)
|
||||
{
|
||||
add_device_softmax_i8_i8_rank4_reduce1_instances(instances);
|
||||
add_device_softmax_i8_i8_rank4_reduce2_instances(instances);
|
||||
add_device_softmax_i8_i8_rank4_reduce3_instances(instances);
|
||||
add_device_softmax_i8_i8_rank4_reduce4_instances(instances);
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,27 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce1.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 3;
|
||||
|
||||
void add_device_softmax_i8_i8_rank3_reduce1_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 1>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,27 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce2.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 3;
|
||||
|
||||
void add_device_softmax_i8_i8_rank3_reduce2_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 2>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,27 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank3_reduce3.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 3;
|
||||
|
||||
void add_device_softmax_i8_i8_rank3_reduce3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 3>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,27 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce1.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 4;
|
||||
|
||||
void add_device_softmax_i8_i8_rank4_reduce1_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 1>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,27 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce2.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 4;
|
||||
|
||||
void add_device_softmax_i8_i8_rank4_reduce2_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 2>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,27 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce3.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 4;
|
||||
|
||||
void add_device_softmax_i8_i8_rank4_reduce3_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 3>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,27 +0,0 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include <vector>
|
||||
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_rank4_reduce4.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/softmax/device_softmax_i8_i8_instance_type.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
static constexpr index_t RANK = 4;
|
||||
|
||||
void add_device_softmax_i8_i8_rank4_reduce4_instances(
|
||||
std::vector<DeviceSoftmaxPtr<I8, F32, I8, PassThrough, PassThrough, RANK>>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_i8_i8_instances<RANK, 4>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
Reference in New Issue
Block a user