Adding Instances and Examples for FP8-based Scaled Convolution and AMAX Reduction. (#1473)

* Enable CMakePresets build

* Verify Convolution, Scaling and ReLU algorithms.

* Add tensor element-wise scale and type cast operation.

* Reduction implemented but does not work.

* Exploration of Reduction functionality.

* Completed example for Convolution scaled with ReLU activation and AMAX reduction.

* WIP: Add required instances for convolution.

* WIP: Create client example. Implement convolution stage.

* Add elementwise instances.

* Add elementwise scale + convert example.

* Add reduction instances.

* WIP: Client example for AMAX reduction.

* WIP: Add instances for multistage reduction.

* WIP: Implementation of multistage reduction.

* Refactoring.

* Clean up.

* Add CMakePresets.json

* Guard off FP8 instances when the data type is not available.

* Add example for Scaled FP8 Convolution with AMAX reduction.

* Refactor CombConvScaleRelu instances.

* Add CombConvScale instances.

* Add client example for Scaled FP8 Convolution with AMAX reduction.

* Cleanup.

[ROCm/composable_kernel commit: c3515f277c]
This commit is contained in:
Andriy Roshchenko
2024-08-21 16:22:41 -06:00
committed by GitHub
parent 94954e9fe4
commit 10be209218
14 changed files with 389 additions and 87 deletions

View File

@@ -1,6 +1,6 @@
if(GPU_TARGETS MATCHES "gfx9")
# Fwd scaleadd scaleadd relu
add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32
add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32
grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_fp32.cpp)
target_link_libraries(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32 PRIVATE composable_kernel::device_conv_operations)
@@ -36,7 +36,7 @@ add_executable(client_grouped_convnd_fwd_bilinear_residual_fp16
grouped_convnd_fwd_bilinear/grouped_conv_fwd_bilinear_residual_fp16.cpp)
target_link_libraries(client_grouped_convnd_fwd_bilinear_residual_fp16 PRIVATE composable_kernel::device_conv_operations)
# Fwd convinvscale
add_executable(client_conv3d_fwd_convinvscale_fp8
add_executable(client_conv3d_fwd_convinvscale_fp8
grouped_convnd_fwd_convinvscale/conv3d_fwd_convinvscale_fp8.cpp)
target_link_libraries(client_conv3d_fwd_convinvscale_fp8 PRIVATE composable_kernel::device_conv_operations)
# Fwd convscale + Bias
@@ -50,10 +50,18 @@ target_link_libraries(client_conv3d_fwd_convscale_relu_fp8 PRIVATE composable_ke
# Fwd convscale + ReLU + AMAX
add_executable(client_conv3d_fwd_convscale_relu_amax_fp8
grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_relu_amax_fp8.cpp)
target_link_libraries(client_conv3d_fwd_convscale_relu_amax_fp8
PRIVATE composable_kernel::device_conv_operations
composable_kernel::device_other_operations
composable_kernel::device_reduction_operations
target_link_libraries(client_conv3d_fwd_convscale_relu_amax_fp8
PRIVATE composable_kernel::device_conv_operations
composable_kernel::device_other_operations
composable_kernel::device_reduction_operations
utility)
# Fwd convscale + AMAX
add_executable(client_conv3d_fwd_convscale_amax_fp8
grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_amax_fp8.cpp)
target_link_libraries(client_conv3d_fwd_convscale_amax_fp8
PRIVATE composable_kernel::device_conv_operations
composable_kernel::device_other_operations
composable_kernel::device_reduction_operations
utility)
# Fwd convscale
add_executable(client_conv3d_fwd_convscale_fp8
@@ -64,11 +72,11 @@ add_executable(client_conv3d_fwd_convscale_bf8
grouped_convnd_fwd_convscale/conv3d_fwd_convscale_bf8.cpp)
target_link_libraries(client_conv3d_fwd_convscale_bf8 PRIVATE composable_kernel::device_conv_operations)
add_executable(client_conv3d_fwd_convscale_fp8_bf8
add_executable(client_conv3d_fwd_convscale_fp8_bf8
grouped_convnd_fwd_convscale/conv3d_fwd_convscale_fp8_bf8.cpp)
target_link_libraries(client_conv3d_fwd_convscale_fp8_bf8 PRIVATE composable_kernel::device_conv_operations)
add_executable(client_conv3d_fwd_convscale_bf8_fp8
add_executable(client_conv3d_fwd_convscale_bf8_fp8
grouped_convnd_fwd_convscale/conv3d_fwd_convscale_bf8_fp8.cpp)
target_link_libraries(client_conv3d_fwd_convscale_bf8_fp8 PRIVATE composable_kernel::device_conv_operations)
# Bwd data bilinear

View File

@@ -15,21 +15,18 @@
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
#include "ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp"
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/utility/type.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale_relu.hpp"
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale.hpp"
#include "ck/utility/reduction_enums.hpp"
#include "ck/library/tensor_operation_instance/gpu/permute_scale.hpp"
#include "ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp"
#include "ck/library/utility/host_tensor.hpp"
namespace ew = ck::tensor_operation::element_wise;
using PassThrough = ew::PassThrough;
using ConvScaleRelu = ew::UnaryCombinedOp<ew::Scale, ew::Scale, ew::Relu>;
using ConvScale = ew::UnaryCombinedOp<ew::Scale, ew::Scale, PassThrough>;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using ConvScaleRelu = ck::tensor_operation::element_wise::ScaleScaleRelu;
using ConvScale = ck::tensor_operation::element_wise::ScaleScalePass;
struct SimpleDeviceMem
{
@@ -221,7 +218,9 @@ bool run_grouped_conv_fwd_convscale_reduce(
* FP8 Convolution with Scaling
*/
std::cout << "\n\nConvolution with scale Benchmarking:" << std::endl;
auto elementwise_op = ConvElementOp{ew::Scale{scale_in}, ew::Scale{scale_wei}, {}};
auto elementwise_op = ConvElementOp{ck::tensor_operation::element_wise::Scale{scale_in},
ck::tensor_operation::element_wise::Scale{scale_wei},
{}};
auto conv_ok = ConvolutionScale<InDataType,
WeiDataType,
ConvOutDataType,
@@ -465,7 +464,7 @@ bool TensorScaleConvert(SimpleDeviceMem& in,
using DeviceScaleConvert =
ck::tensor_operation::device::DeviceElementwise<ck::Tuple<InDataType>,
ck::Tuple<OutDataType>,
ew::Scale,
ck::tensor_operation::element_wise::Scale,
NumDimSpatial + NumNonSpatialDim>;
// get device op instances
@@ -483,7 +482,7 @@ bool TensorScaleConvert(SimpleDeviceMem& in,
// profile device operation instances
std::cout << "Run all DeviceScaleConvert instances and do timing" << std::endl;
auto scale_convert = ew::Scale{scale_out};
auto scale_convert = ck::tensor_operation::element_wise::Scale{scale_out};
for(int i = 0; i < op_ptrs.size(); ++i)
{

View File

@@ -0,0 +1,58 @@
// SPDX-License-Identifier: MIT
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
#include "common.hpp"
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
// Compile-time configuration for this client example. Everything declared here
// is consumed by main() below, which forwards it to
// run_grouped_conv_fwd_convscale_reduce.

// Element types of the input and weight tensors (FP8).
using InDataType = ck::f8_t;
using WeiDataType = ck::f8_t;
// NOTE(review): CShuffleDataType is not passed to the call in main(); confirm
// whether common.hpp relies on it before removing.
using CShuffleDataType = float;
using ConvOutDataType = float; // data type of convolution result
using OutDataType = ck::f8_t; // data type of final result
// Compute types forwarded as the last two template arguments in main().
using AComputeDataType = ck::f8_t;
using BComputeDataType = ck::f8_t;
// Element-wise operation applied by the convolution stage; ConvScale is the
// alias declared in common.hpp (scale only, no ReLU — compare ConvScaleRelu).
using ConvElementOp = ConvScale;
// Tensor layouts. The dimension order of each layout matches the order of the
// length lists passed in main(): {N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C} and
// {N, Do, Ho, Wo, G, K}.
using InLayout = ck::tensor_layout::convolution::NDHWGC;
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
// Reduction performed on the convolution result: AMAX.
constexpr auto ReduceOpId = ck::ReduceTensorOp::AMAX;
// Number of spatial dimensions (D, H, W) — this is a 3D convolution example.
static constexpr ck::index_t NumDimSpatial = 3;
// Problem sizes: G groups, N batch, K output channels, C input channels.
static constexpr ck::index_t G = 1;
static constexpr ck::index_t N = 64;
static constexpr ck::index_t K = 128;
static constexpr ck::index_t C = 64;
// Filter extents along the three spatial dimensions.
static constexpr ck::index_t Z = 3;
static constexpr ck::index_t Y = 3;
static constexpr ck::index_t X = 3;
// Input spatial extents.
static constexpr ck::index_t Di = 28;
static constexpr ck::index_t Hi = 28;
static constexpr ck::index_t Wi = 3;
// Output spatial extents. They equal the input extents even though the filter
// is 3x3x3; presumably the helper in common.hpp uses unit stride with padding
// of 1 — TODO confirm.
static constexpr ck::index_t Do = 28;
static constexpr ck::index_t Ho = 28;
static constexpr ck::index_t Wo = 3;
int main()
{
return run_grouped_conv_fwd_convscale_reduce<NumDimSpatial,
InDataType,
WeiDataType,
ConvOutDataType,
OutDataType,
ConvElementOp,
ReduceOpId,
InLayout,
WeiLayout,
OutLayout,
3,
AComputeDataType,
BComputeDataType>(
{N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C}, {N, Do, Ho, Wo, G, K})
? EXIT_SUCCESS
: EXIT_FAILURE;
}