mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-18 20:09:25 +00:00
Standalone sweep once softmax kernel w/ ckProfiler (#295)
* use 'sweep once' softmax kernel where applicable
* threadwise copy's dst buffer can specify invalid element value
* add int8 in/out float compute softmax support
give a bit of leeway for int absolute tolerance as there's a single data point of all test cases showing off-by-1 error
* format
* softmax inherits DeviceNormalization
* softmax profiler stub
* tighten up reference softmax interface
* example prints tensor dimension
* add fp32 to softmax profiler
* rename header
* hook with ckProfiler
* format
* resolve merge conflict
* resolve merge conflicts
* update normalization profiler help string
* resolve conflict
* typo
* remove residual
* softmax profiler: address feedback
* test for mixed precision input/output
* fully qualify ck::math::isnan
* add comment for device normalization interface
* revise wording
* constness for alpha/beta scaler pointer
[ROCm/composable_kernel commit: 93c99f3d87]
This commit is contained in:
@@ -25,6 +25,7 @@ add_subdirectory(conv2d_fwd_bias_relu_add)
|
||||
add_subdirectory(conv2d_bwd_data)
|
||||
add_subdirectory(convnd_bwd_data)
|
||||
add_subdirectory(conv2d_bwd_weight)
|
||||
add_subdirectory(normalization)
|
||||
add_subdirectory(reduce)
|
||||
|
||||
add_library(device_operations STATIC
|
||||
|
||||
@@ -0,0 +1,10 @@
|
||||
# device_normalization_instance
|
||||
set(DEVICE_NORMALIZATION_INSTANCE_SOURCE
|
||||
device_softmax_f32_f32_instance.cpp
|
||||
device_softmax_f16_f16_instance.cpp
|
||||
)
|
||||
|
||||
add_library(device_normalization_instance OBJECT ${DEVICE_NORMALIZATION_INSTANCE_SOURCE})
|
||||
set_target_properties(device_normalization_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
|
||||
clang_tidy_check(device_normalization_instance)
|
||||
@@ -0,0 +1,49 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
|
||||
#include "ck/utility/data_type.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_normalization_instance {
|
||||
|
||||
using F16 = ck::half_t;
|
||||
using F32 = float;
|
||||
|
||||
template <index_t Rank, index_t Reduce>
|
||||
using device_softmax_f16_f16_instances = std::tuple<
|
||||
// clang-format off
|
||||
// InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
|
||||
DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, // fallback kernel
|
||||
DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 8, 32, 1, 8, 1, 8, 8>,
|
||||
DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 4, 64, 1, 8, 1, 8, 8>,
|
||||
DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 2, 128, 1, 8, 1, 8, 8>,
|
||||
DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 2, 128, 1, 16, 1, 8, 8>,
|
||||
DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 2, 128, 1, 32, 1, 8, 8>,
|
||||
DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 1, 256, 1, 8, 1, 8, 8>,
|
||||
DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 1, 256, 1, 16, 1, 8, 8>,
|
||||
DeviceSoftmax<F16, F32, F16, Rank, Reduce, 256, 1, 256, 1, 32, 1, 8, 8>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
void add_device_softmax_f16_f16_rank3_instances(std::vector<DeviceNormalizationPtr>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 1>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<3, 2>{});
|
||||
}
|
||||
|
||||
void add_device_softmax_f16_f16_rank4_instances(std::vector<DeviceNormalizationPtr>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 1>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 2>{});
|
||||
add_device_operation_instances(instances, device_softmax_f16_f16_instances<4, 3>{});
|
||||
}
|
||||
|
||||
} // namespace device_normalization_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -0,0 +1,48 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_softmax.hpp"
|
||||
#include "ck/utility/data_type.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace device_normalization_instance {
|
||||
|
||||
using F32 = float;
|
||||
|
||||
template <index_t Rank, index_t Reduce>
|
||||
using device_softmax_f32_f32_instances = std::tuple<
|
||||
// clang-format off
|
||||
// InDataType, AccDataType, OutDataType, Rank, NumReduceDim, BlockSize, MThreadClusterSize, KThreadClusterSize, MThreadSliceSize, KThreadSliceSize, InSrcVectorDim, InSrcVectorSize, OutDstVectorSize>
|
||||
DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 8, 32, 1, 8, 1, 1, 1>, // fallback kernel
|
||||
DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 8, 32, 1, 8, 1, 4, 4>,
|
||||
DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 4, 64, 1, 8, 1, 4, 4>,
|
||||
DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 2, 128, 1, 8, 1, 4, 4>,
|
||||
DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 2, 128, 1, 16, 1, 4, 4>,
|
||||
DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 2, 128, 1, 32, 1, 4, 4>,
|
||||
DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 1, 256, 1, 8, 1, 4, 4>,
|
||||
DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 1, 256, 1, 16, 1, 4, 4>,
|
||||
DeviceSoftmax<F32, F32, F32, Rank, Reduce, 256, 1, 256, 1, 32, 1, 4, 4>
|
||||
// clang-format on
|
||||
>;
|
||||
|
||||
void add_device_softmax_f32_f32_rank3_instances(std::vector<DeviceNormalizationPtr>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 1>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<3, 2>{});
|
||||
}
|
||||
|
||||
void add_device_softmax_f32_f32_rank4_instances(std::vector<DeviceNormalizationPtr>& instances)
|
||||
{
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 1>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 2>{});
|
||||
add_device_operation_instances(instances, device_softmax_f32_f32_instances<4, 3>{});
|
||||
}
|
||||
|
||||
} // namespace device_normalization_instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
Reference in New Issue
Block a user