mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-05-18 03:49:41 +00:00
Adding Instances and Examples for FP8-based Scaled Convolution and AMAX Reduction. (#1473)
* Enable CMakePresets build
* Verify Convolution, Scaling and ReLU algorithms.
* Add tensor element-wise scale and type cast operation.
* Reduction implemented but does not work.
* Exploration of Reduction functionality.
* Completed example for Convolution scaled with ReLU activation and AMAX reduction.
* WIP: Add required instances for convolution.
* WIP: Create client example. Implement convolution stage.
* Add elementwise instances.
* Add elementwise scale + convert example.
* Add reduction instances.
* WIP: Client example for AMAX reduction.
* WIP: Add instances for multistage reduction.
* WIP: Implementation of multistage reduction.
* Refactoring.
* Clean up.
* Add CMakePresets.json
* Guard off FP8 instances when the data type is not available.
* Add example for Scaled FP8 Convolution with AMAX reduction.
* Refactor CombConvScaleRelu instances.
* Add CombConvScale instances.
* Add client example for Scaled FP8 Convolution with AMAX reduction.
* Cleanup.
[ROCm/composable_kernel commit: c3515f277c]
This commit is contained in:
committed by
GitHub
parent
94954e9fe4
commit
10be209218
@@ -1,6 +1,6 @@
|
||||
if(GPU_TARGETS MATCHES "gfx9")
|
||||
# Fwd scaleadd scaleadd relu
|
||||
add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32
|
||||
add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32
|
||||
grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_fp32.cpp)
|
||||
target_link_libraries(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32 PRIVATE composable_kernel::device_conv_operations)
|
||||
|
||||
@@ -36,7 +36,7 @@ add_executable(client_grouped_convnd_fwd_bilinear_residual_fp16
|
||||
grouped_convnd_fwd_bilinear/grouped_conv_fwd_bilinear_residual_fp16.cpp)
|
||||
target_link_libraries(client_grouped_convnd_fwd_bilinear_residual_fp16 PRIVATE composable_kernel::device_conv_operations)
|
||||
# Fwd convinvscale
|
||||
add_executable(client_conv3d_fwd_convinvscale_fp8
|
||||
add_executable(client_conv3d_fwd_convinvscale_fp8
|
||||
grouped_convnd_fwd_convinvscale/conv3d_fwd_convinvscale_fp8.cpp)
|
||||
target_link_libraries(client_conv3d_fwd_convinvscale_fp8 PRIVATE composable_kernel::device_conv_operations)
|
||||
# Fwd convscale + Bias
|
||||
@@ -50,10 +50,18 @@ target_link_libraries(client_conv3d_fwd_convscale_relu_fp8 PRIVATE composable_ke
|
||||
# Fwd convscale + ReLU + AMAX
|
||||
add_executable(client_conv3d_fwd_convscale_relu_amax_fp8
|
||||
grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_relu_amax_fp8.cpp)
|
||||
target_link_libraries(client_conv3d_fwd_convscale_relu_amax_fp8
|
||||
PRIVATE composable_kernel::device_conv_operations
|
||||
composable_kernel::device_other_operations
|
||||
composable_kernel::device_reduction_operations
|
||||
target_link_libraries(client_conv3d_fwd_convscale_relu_amax_fp8
|
||||
PRIVATE composable_kernel::device_conv_operations
|
||||
composable_kernel::device_other_operations
|
||||
composable_kernel::device_reduction_operations
|
||||
utility)
|
||||
# Fwd convscale + AMAX
|
||||
add_executable(client_conv3d_fwd_convscale_amax_fp8
|
||||
grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_amax_fp8.cpp)
|
||||
target_link_libraries(client_conv3d_fwd_convscale_amax_fp8
|
||||
PRIVATE composable_kernel::device_conv_operations
|
||||
composable_kernel::device_other_operations
|
||||
composable_kernel::device_reduction_operations
|
||||
utility)
|
||||
# Fwd convscale
|
||||
add_executable(client_conv3d_fwd_convscale_fp8
|
||||
@@ -64,11 +72,11 @@ add_executable(client_conv3d_fwd_convscale_bf8
|
||||
grouped_convnd_fwd_convscale/conv3d_fwd_convscale_bf8.cpp)
|
||||
target_link_libraries(client_conv3d_fwd_convscale_bf8 PRIVATE composable_kernel::device_conv_operations)
|
||||
|
||||
add_executable(client_conv3d_fwd_convscale_fp8_bf8
|
||||
add_executable(client_conv3d_fwd_convscale_fp8_bf8
|
||||
grouped_convnd_fwd_convscale/conv3d_fwd_convscale_fp8_bf8.cpp)
|
||||
target_link_libraries(client_conv3d_fwd_convscale_fp8_bf8 PRIVATE composable_kernel::device_conv_operations)
|
||||
|
||||
add_executable(client_conv3d_fwd_convscale_bf8_fp8
|
||||
add_executable(client_conv3d_fwd_convscale_bf8_fp8
|
||||
grouped_convnd_fwd_convscale/conv3d_fwd_convscale_bf8_fp8.cpp)
|
||||
target_link_libraries(client_conv3d_fwd_convscale_bf8_fp8 PRIVATE composable_kernel::device_conv_operations)
|
||||
# Bwd data bilinear
|
||||
|
||||
@@ -15,21 +15,18 @@
|
||||
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
#include "ck/utility/type.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale_relu.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale.hpp"
|
||||
#include "ck/utility/reduction_enums.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/permute_scale.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
|
||||
namespace ew = ck::tensor_operation::element_wise;
|
||||
|
||||
using PassThrough = ew::PassThrough;
|
||||
using ConvScaleRelu = ew::UnaryCombinedOp<ew::Scale, ew::Scale, ew::Relu>;
|
||||
using ConvScale = ew::UnaryCombinedOp<ew::Scale, ew::Scale, PassThrough>;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using ConvScaleRelu = ck::tensor_operation::element_wise::ScaleScaleRelu;
|
||||
using ConvScale = ck::tensor_operation::element_wise::ScaleScalePass;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
@@ -221,7 +218,9 @@ bool run_grouped_conv_fwd_convscale_reduce(
|
||||
* FP8 Convolution with Scaling
|
||||
*/
|
||||
std::cout << "\n\nConvolution with scale Benchmarking:" << std::endl;
|
||||
auto elementwise_op = ConvElementOp{ew::Scale{scale_in}, ew::Scale{scale_wei}, {}};
|
||||
auto elementwise_op = ConvElementOp{ck::tensor_operation::element_wise::Scale{scale_in},
|
||||
ck::tensor_operation::element_wise::Scale{scale_wei},
|
||||
{}};
|
||||
auto conv_ok = ConvolutionScale<InDataType,
|
||||
WeiDataType,
|
||||
ConvOutDataType,
|
||||
@@ -465,7 +464,7 @@ bool TensorScaleConvert(SimpleDeviceMem& in,
|
||||
using DeviceScaleConvert =
|
||||
ck::tensor_operation::device::DeviceElementwise<ck::Tuple<InDataType>,
|
||||
ck::Tuple<OutDataType>,
|
||||
ew::Scale,
|
||||
ck::tensor_operation::element_wise::Scale,
|
||||
NumDimSpatial + NumNonSpatialDim>;
|
||||
|
||||
// get device op instances
|
||||
@@ -483,7 +482,7 @@ bool TensorScaleConvert(SimpleDeviceMem& in,
|
||||
// profile device operation instances
|
||||
std::cout << "Run all DeviceScaleConvert instances and do timing" << std::endl;
|
||||
|
||||
auto scale_convert = ew::Scale{scale_out};
|
||||
auto scale_convert = ck::tensor_operation::element_wise::Scale{scale_out};
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
|
||||
@@ -0,0 +1,58 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
|
||||
// Element types used to instantiate the example; all of them are forwarded
// as template arguments in main() except CShuffleDataType, which is not
// referenced anywhere else in this file.
using InDataType = ck::f8_t;
|
||||
using WeiDataType = ck::f8_t;
|
||||
using CShuffleDataType = float;
|
||||
using ConvOutDataType = float; // data type of convolution result
|
||||
using OutDataType = ck::f8_t; // data type of final result
|
||||
using AComputeDataType = ck::f8_t;
|
||||
using BComputeDataType = ck::f8_t;
|
||||
|
||||
// ConvScale is not declared in this file.
// NOTE(review): presumably provided by "common.hpp" - confirm.
using ConvElementOp = ConvScale;
|
||||
|
||||
// Tensor layouts; the letter order of each layout name matches the order of
// the corresponding dimension list passed in main().
using InLayout = ck::tensor_layout::convolution::NDHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
|
||||
|
||||
// Reduction operation forwarded to the example driver as a template argument.
constexpr auto ReduceOpId = ck::ReduceTensorOp::AMAX;
|
||||
|
||||
// Problem sizes. main() passes them as {N, Di, Hi, Wi, G, C},
// {G, K, Z, Y, X, C} and {N, Do, Ho, Wo, G, K}.
// NOTE(review): Do/Ho/Wo equal Di/Hi/Wi with a 3x3x3 filter, which presumably
// relies on stride 1 and padding 1 inside the driver - confirm.
static constexpr ck::index_t NumDimSpatial = 3;
|
||||
static constexpr ck::index_t G = 1;
|
||||
static constexpr ck::index_t N = 64;
|
||||
static constexpr ck::index_t K = 128;
|
||||
static constexpr ck::index_t C = 64;
|
||||
static constexpr ck::index_t Z = 3;
|
||||
static constexpr ck::index_t Y = 3;
|
||||
static constexpr ck::index_t X = 3;
|
||||
static constexpr ck::index_t Di = 28;
|
||||
static constexpr ck::index_t Hi = 28;
|
||||
static constexpr ck::index_t Wi = 3;
|
||||
static constexpr ck::index_t Do = 28;
|
||||
static constexpr ck::index_t Ho = 28;
|
||||
static constexpr ck::index_t Wo = 3;
|
||||
|
||||
// Entry point of the example: instantiates
// run_grouped_conv_fwd_convscale_reduce with the types, layouts, reduction id
// and problem sizes declared above, and converts its result to a process exit
// status (a true result yields EXIT_SUCCESS, otherwise EXIT_FAILURE).
// NOTE(review): the literal 3 after OutLayout is presumably the number of
// non-spatial dimensions - confirm against the template parameter list of
// the driver.
int main()
|
||||
{
|
||||
return run_grouped_conv_fwd_convscale_reduce<NumDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
ConvOutDataType,
|
||||
OutDataType,
|
||||
ConvElementOp,
|
||||
ReduceOpId,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
3,
|
||||
AComputeDataType,
|
||||
BComputeDataType>(
|
||||
{N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C}, {N, Do, Ho, Wo, G, K})
|
||||
? EXIT_SUCCESS
|
||||
: EXIT_FAILURE;
|
||||
}
|
||||
Reference in New Issue
Block a user