mirror of
https://github.com/ROCm/composable_kernel.git
synced 2026-04-19 22:39:03 +00:00
Adding Instances and Examples for FP8-based Scaled Convolution and AMAX Reduction. (#1473)
* Enable CMakePresets build * Verify Convolution, Scaling and ReLU algorithms. * Add tensor element-wise scale and type cast operation. * Reduction implemented but does not work. * Exploration of Reduction functionality. * Completed example for Convolution scaled with ReLu activation and AMAX reduction. * WIP: Add required instances for convolution. * WIP: Create client example. Implement convolution stage. * Add elementwise instances. * Add elementwise scale + convert example. * Add reduction instances. * WIP: Client example for AMAX reduction. * WIP: Add instances for multistage reduction. * WIP: Implementation of multistage reduction. * Refactoring. * Clean up. * Add CMakePresets.json * Guard off FP8 instances when the data type is not available. * Add example for Scaled FP8 Convolution with AMAX reduction. * Refactor CombConvScaleRelu instances. * Add CombConvScale instances. * Add client example for Scaled FP8 Convolution with AMAX reduction. * Cleanup.
This commit is contained in:
committed by
GitHub
parent
e20f20efbf
commit
c3515f277c
@@ -1,6 +1,6 @@
|
||||
if(GPU_TARGETS MATCHES "gfx9")
|
||||
# Fwd scaleadd scaleadd relu
|
||||
add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32
|
||||
add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32
|
||||
grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_fp32.cpp)
|
||||
target_link_libraries(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32 PRIVATE composable_kernel::device_conv_operations)
|
||||
|
||||
@@ -36,7 +36,7 @@ add_executable(client_grouped_convnd_fwd_bilinear_residual_fp16
|
||||
grouped_convnd_fwd_bilinear/grouped_conv_fwd_bilinear_residual_fp16.cpp)
|
||||
target_link_libraries(client_grouped_convnd_fwd_bilinear_residual_fp16 PRIVATE composable_kernel::device_conv_operations)
|
||||
# Fwd convinvscale
|
||||
add_executable(client_conv3d_fwd_convinvscale_fp8
|
||||
add_executable(client_conv3d_fwd_convinvscale_fp8
|
||||
grouped_convnd_fwd_convinvscale/conv3d_fwd_convinvscale_fp8.cpp)
|
||||
target_link_libraries(client_conv3d_fwd_convinvscale_fp8 PRIVATE composable_kernel::device_conv_operations)
|
||||
# Fwd convscale + Bias
|
||||
@@ -50,10 +50,18 @@ target_link_libraries(client_conv3d_fwd_convscale_relu_fp8 PRIVATE composable_ke
|
||||
# Fwd convscale + ReLU + AMAX
|
||||
add_executable(client_conv3d_fwd_convscale_relu_amax_fp8
|
||||
grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_relu_amax_fp8.cpp)
|
||||
target_link_libraries(client_conv3d_fwd_convscale_relu_amax_fp8
|
||||
PRIVATE composable_kernel::device_conv_operations
|
||||
composable_kernel::device_other_operations
|
||||
composable_kernel::device_reduction_operations
|
||||
target_link_libraries(client_conv3d_fwd_convscale_relu_amax_fp8
|
||||
PRIVATE composable_kernel::device_conv_operations
|
||||
composable_kernel::device_other_operations
|
||||
composable_kernel::device_reduction_operations
|
||||
utility)
|
||||
# Fwd convscale + AMAX
|
||||
add_executable(client_conv3d_fwd_convscale_amax_fp8
|
||||
grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_amax_fp8.cpp)
|
||||
target_link_libraries(client_conv3d_fwd_convscale_amax_fp8
|
||||
PRIVATE composable_kernel::device_conv_operations
|
||||
composable_kernel::device_other_operations
|
||||
composable_kernel::device_reduction_operations
|
||||
utility)
|
||||
# Fwd convscale
|
||||
add_executable(client_conv3d_fwd_convscale_fp8
|
||||
@@ -64,11 +72,11 @@ add_executable(client_conv3d_fwd_convscale_bf8
|
||||
grouped_convnd_fwd_convscale/conv3d_fwd_convscale_bf8.cpp)
|
||||
target_link_libraries(client_conv3d_fwd_convscale_bf8 PRIVATE composable_kernel::device_conv_operations)
|
||||
|
||||
add_executable(client_conv3d_fwd_convscale_fp8_bf8
|
||||
add_executable(client_conv3d_fwd_convscale_fp8_bf8
|
||||
grouped_convnd_fwd_convscale/conv3d_fwd_convscale_fp8_bf8.cpp)
|
||||
target_link_libraries(client_conv3d_fwd_convscale_fp8_bf8 PRIVATE composable_kernel::device_conv_operations)
|
||||
|
||||
add_executable(client_conv3d_fwd_convscale_bf8_fp8
|
||||
add_executable(client_conv3d_fwd_convscale_bf8_fp8
|
||||
grouped_convnd_fwd_convscale/conv3d_fwd_convscale_bf8_fp8.cpp)
|
||||
target_link_libraries(client_conv3d_fwd_convscale_bf8_fp8 PRIVATE composable_kernel::device_conv_operations)
|
||||
# Bwd data bilinear
|
||||
|
||||
@@ -15,21 +15,18 @@
|
||||
#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_reduce.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
|
||||
#include "ck/utility/tuple.hpp"
|
||||
#include "ck/utility/type.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale_relu.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale.hpp"
|
||||
#include "ck/utility/reduction_enums.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/permute_scale.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp"
|
||||
#include "ck/library/utility/host_tensor.hpp"
|
||||
|
||||
namespace ew = ck::tensor_operation::element_wise;
|
||||
|
||||
using PassThrough = ew::PassThrough;
|
||||
using ConvScaleRelu = ew::UnaryCombinedOp<ew::Scale, ew::Scale, ew::Relu>;
|
||||
using ConvScale = ew::UnaryCombinedOp<ew::Scale, ew::Scale, PassThrough>;
|
||||
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
|
||||
using ConvScaleRelu = ck::tensor_operation::element_wise::ScaleScaleRelu;
|
||||
using ConvScale = ck::tensor_operation::element_wise::ScaleScalePass;
|
||||
|
||||
struct SimpleDeviceMem
|
||||
{
|
||||
@@ -221,7 +218,9 @@ bool run_grouped_conv_fwd_convscale_reduce(
|
||||
* FP8 Convolution with Scaling
|
||||
*/
|
||||
std::cout << "\n\nConvolution with scale Benchmarking:" << std::endl;
|
||||
auto elementwise_op = ConvElementOp{ew::Scale{scale_in}, ew::Scale{scale_wei}, {}};
|
||||
auto elementwise_op = ConvElementOp{ck::tensor_operation::element_wise::Scale{scale_in},
|
||||
ck::tensor_operation::element_wise::Scale{scale_wei},
|
||||
{}};
|
||||
auto conv_ok = ConvolutionScale<InDataType,
|
||||
WeiDataType,
|
||||
ConvOutDataType,
|
||||
@@ -465,7 +464,7 @@ bool TensorScaleConvert(SimpleDeviceMem& in,
|
||||
using DeviceScaleConvert =
|
||||
ck::tensor_operation::device::DeviceElementwise<ck::Tuple<InDataType>,
|
||||
ck::Tuple<OutDataType>,
|
||||
ew::Scale,
|
||||
ck::tensor_operation::element_wise::Scale,
|
||||
NumDimSpatial + NumNonSpatialDim>;
|
||||
|
||||
// get device op instances
|
||||
@@ -483,7 +482,7 @@ bool TensorScaleConvert(SimpleDeviceMem& in,
|
||||
// profile device operation instances
|
||||
std::cout << "Run all DeviceScaleConvert instances and do timing" << std::endl;
|
||||
|
||||
auto scale_convert = ew::Scale{scale_out};
|
||||
auto scale_convert = ck::tensor_operation::element_wise::Scale{scale_out};
|
||||
|
||||
for(int i = 0; i < op_ptrs.size(); ++i)
|
||||
{
|
||||
|
||||
@@ -0,0 +1,58 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "common.hpp"
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
|
||||
using InDataType = ck::f8_t;
|
||||
using WeiDataType = ck::f8_t;
|
||||
using CShuffleDataType = float;
|
||||
using ConvOutDataType = float; // data type of convolution result
|
||||
using OutDataType = ck::f8_t; // data type of final result
|
||||
using AComputeDataType = ck::f8_t;
|
||||
using BComputeDataType = ck::f8_t;
|
||||
|
||||
using ConvElementOp = ConvScale;
|
||||
|
||||
using InLayout = ck::tensor_layout::convolution::NDHWGC;
|
||||
using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
|
||||
using OutLayout = ck::tensor_layout::convolution::NDHWGK;
|
||||
|
||||
constexpr auto ReduceOpId = ck::ReduceTensorOp::AMAX;
|
||||
|
||||
static constexpr ck::index_t NumDimSpatial = 3;
|
||||
static constexpr ck::index_t G = 1;
|
||||
static constexpr ck::index_t N = 64;
|
||||
static constexpr ck::index_t K = 128;
|
||||
static constexpr ck::index_t C = 64;
|
||||
static constexpr ck::index_t Z = 3;
|
||||
static constexpr ck::index_t Y = 3;
|
||||
static constexpr ck::index_t X = 3;
|
||||
static constexpr ck::index_t Di = 28;
|
||||
static constexpr ck::index_t Hi = 28;
|
||||
static constexpr ck::index_t Wi = 3;
|
||||
static constexpr ck::index_t Do = 28;
|
||||
static constexpr ck::index_t Ho = 28;
|
||||
static constexpr ck::index_t Wo = 3;
|
||||
|
||||
int main()
|
||||
{
|
||||
return run_grouped_conv_fwd_convscale_reduce<NumDimSpatial,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
ConvOutDataType,
|
||||
OutDataType,
|
||||
ConvElementOp,
|
||||
ReduceOpId,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
OutLayout,
|
||||
3,
|
||||
AComputeDataType,
|
||||
BComputeDataType>(
|
||||
{N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C}, {N, Do, Ho, Wo, G, K})
|
||||
? EXIT_SUCCESS
|
||||
: EXIT_FAILURE;
|
||||
}
|
||||
@@ -4,7 +4,10 @@ foreach(gpu IN LISTS GPU_TARGETS)
|
||||
if(gpu IN_LIST gpu_list AND target EQUAL 0)
|
||||
add_custom_target(example_convnd_activ_xdl_convscale_reduce)
|
||||
add_example_executable(example_convnd_fwd_xdl_convscale_relu_amax_fp8 convnd_fwd_xdl_convscale_relu_amax_fp8.cpp)
|
||||
add_example_dependencies(example_convnd_activ_xdl_convscale_reduce example_convnd_fwd_xdl_convscale_relu_amax_fp8 )
|
||||
add_example_dependencies(example_convnd_activ_xdl_convscale_reduce example_convnd_fwd_xdl_convscale_relu_amax_fp8)
|
||||
|
||||
add_example_executable(example_convnd_fwd_xdl_convscale_amax_fp8 convnd_fwd_xdl_convscale_amax_fp8.cpp)
|
||||
add_example_dependencies(example_convnd_activ_xdl_convscale_reduce example_convnd_fwd_xdl_convscale_amax_fp8)
|
||||
|
||||
set(target 1)
|
||||
endif()
|
||||
|
||||
@@ -0,0 +1,82 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "convnd_fwd_convscale_reduce_common.hpp"
|
||||
|
||||
#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
|
||||
|
||||
using InDataType = ck::f8_t;
|
||||
using WeiDataType = ck::f8_t;
|
||||
using AccDataType = float;
|
||||
using CShuffleDataType = float;
|
||||
using ConvOutDataType = float; // data type of convolution result
|
||||
using OutDataType = ck::f8_t; // data type of final result
|
||||
using AComputeDataType = ck::f8_t;
|
||||
using BComputeDataType = ck::f8_t;
|
||||
|
||||
template <ck::index_t... Is>
|
||||
using S = ck::Sequence<Is...>;
|
||||
|
||||
using InElementOp = PassThrough;
|
||||
using WeiElementOp = PassThrough;
|
||||
using OutElementOp = ConvScale;
|
||||
|
||||
static constexpr auto ConvSpec =
|
||||
ck::tensor_operation::device::ConvolutionForwardSpecialization::Default;
|
||||
|
||||
static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding;
|
||||
|
||||
template <ck::index_t NDimSpatial, typename InLayout, typename WeiLayout, typename OutLayout>
|
||||
using DeviceGroupedConvNDFwdInstance =
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle<
|
||||
NDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
ck::Tuple<>,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
AccDataType,
|
||||
CShuffleDataType,
|
||||
ck::Tuple<>,
|
||||
ConvOutDataType,
|
||||
InElementOp,
|
||||
WeiElementOp,
|
||||
OutElementOp,
|
||||
ConvSpec, // ConvForwardSpecialization
|
||||
GemmSpec, // GemmSpecialization
|
||||
1, //
|
||||
256, // BlockSize
|
||||
128, // MPerBlock
|
||||
256, // NPerBlock
|
||||
32, // KPerBlock
|
||||
8, // AK1
|
||||
8, // BK1
|
||||
32, // MPerXdl
|
||||
32, // NPerXdl
|
||||
2, // MXdlPerWave
|
||||
4, // NXdlPerWave
|
||||
S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1
|
||||
S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // ABlockTransferSrcAccessOrder
|
||||
2, // ABlockTransferSrcVectorDim
|
||||
8, // ABlockTransferSrcScalarPerVector
|
||||
8, // ABlockTransferDstScalarPerVector_AK1
|
||||
1, // ABlockLdsExtraM
|
||||
S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1
|
||||
S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder
|
||||
S<1, 0, 2>, // BBlockTransferSrcAccessOrder
|
||||
2, // BBlockTransferSrcVectorDim
|
||||
8, // BBlockTransferSrcScalarPerVector
|
||||
8, // BBlockTransferDstScalarPerVector_BK1
|
||||
1, // BBlockLdsExtraN
|
||||
1,
|
||||
1,
|
||||
S<1, 32, 1, 8>,
|
||||
8,
|
||||
AComputeDataType,
|
||||
BComputeDataType>;
|
||||
|
||||
#include "run_convnd_fwd_example.inc"
|
||||
|
||||
int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 0 : 1; }
|
||||
@@ -3,7 +3,6 @@
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "ck/utility/data_type.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
namespace ck {
|
||||
@@ -107,6 +106,9 @@ struct TrinaryWithUnaryCombinedOp
|
||||
UnaryOp2 unary_op2_{};
|
||||
};
|
||||
|
||||
using ScaleScalePass = UnaryCombinedOp<Scale, Scale, PassThrough>;
|
||||
using ScaleScaleRelu = UnaryCombinedOp<Scale, Scale, Relu>;
|
||||
|
||||
} // namespace element_wise
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
|
||||
@@ -8,9 +8,7 @@
|
||||
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
|
||||
|
||||
#include "ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
|
||||
namespace ck {
|
||||
@@ -177,6 +175,88 @@ struct DeviceOperationInstanceFactory<ck::tensor_operation::device::DeviceGroupe
|
||||
}
|
||||
};
|
||||
|
||||
using CombConvScale = ck::tensor_operation::element_wise::ScaleScalePass;
|
||||
|
||||
#ifdef CK_ENABLE_FP8
|
||||
void add_device_grouped_conv3d_fwd_xdl_combconvscale_ndhwgc_gkzyxc_ndhwgk_f8_f8_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<>,
|
||||
NDHWGK,
|
||||
F8,
|
||||
F8,
|
||||
ck::Tuple<>,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
CombConvScale,
|
||||
F8,
|
||||
F8>>>& instances);
|
||||
#endif
|
||||
|
||||
template <ck::index_t NumDimSpatial,
|
||||
typename InLayout,
|
||||
typename WeiLayout,
|
||||
typename DLayouts,
|
||||
typename OutLayout,
|
||||
typename InDataType,
|
||||
typename WeiDataType,
|
||||
typename DDataTypes,
|
||||
typename OutDataType,
|
||||
typename AComputeType,
|
||||
typename BComputeType>
|
||||
struct DeviceOperationInstanceFactory<
|
||||
ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
DLayouts,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
DDataTypes,
|
||||
OutDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
CombConvScale,
|
||||
AComputeType,
|
||||
BComputeType>>
|
||||
{
|
||||
using DeviceOp = DeviceGroupedConvFwdMultipleABD<NumDimSpatial,
|
||||
InLayout,
|
||||
WeiLayout,
|
||||
DLayouts,
|
||||
OutLayout,
|
||||
InDataType,
|
||||
WeiDataType,
|
||||
DDataTypes,
|
||||
OutDataType,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
CombConvScale,
|
||||
AComputeType,
|
||||
BComputeType>;
|
||||
|
||||
static auto GetInstances()
|
||||
{
|
||||
std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
|
||||
if constexpr(NumDimSpatial == 3 && is_same_v<InLayout, NDHWGC> &&
|
||||
is_same_v<WeiLayout, GKZYXC> && is_same_v<OutLayout, NDHWGK>)
|
||||
{
|
||||
#ifdef CK_ENABLE_FP8
|
||||
if constexpr(is_same_v<InDataType, f8_t> && is_same_v<WeiDataType, f8_t> &&
|
||||
is_same_v<OutDataType, F32> && is_same_v<AComputeType, f8_t> &&
|
||||
is_same_v<BComputeType, f8_t>)
|
||||
{
|
||||
add_device_grouped_conv3d_fwd_xdl_combconvscale_ndhwgc_gkzyxc_ndhwgk_f8_f8_f32_instances(
|
||||
op_ptrs);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
return op_ptrs;
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
@@ -9,7 +9,6 @@
|
||||
#include "ck/ck.hpp"
|
||||
#include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
|
||||
|
||||
namespace ck {
|
||||
@@ -100,8 +99,7 @@ struct DeviceOperationInstanceFactory<
|
||||
}
|
||||
};
|
||||
|
||||
namespace ew = ck::tensor_operation::element_wise;
|
||||
using CombConvScaleRelu = ew::UnaryCombinedOp<ew::Scale, ew::Scale, ew::Relu>;
|
||||
using CombConvScaleRelu = ck::tensor_operation::element_wise::ScaleScaleRelu;
|
||||
|
||||
#ifdef CK_ENABLE_FP8
|
||||
void add_device_grouped_conv3d_fwd_xdl_combconvscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_f8_f32_instances(
|
||||
|
||||
@@ -47,7 +47,7 @@ using device_permute_scale_f16_instances =
|
||||
|
||||
#if 0
|
||||
// Disabled instances to improve compilation time
|
||||
// They listed here to show other possible combinations of parameters
|
||||
// They listed here to show other possible combinations of parameters
|
||||
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 256, 256, 256, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
|
||||
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 128, 256, 128, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
|
||||
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 128, 128, 256, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
|
||||
@@ -58,7 +58,7 @@ using device_permute_scale_f16_instances =
|
||||
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 64, 128, 128, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
|
||||
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 32, 128, 64, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
|
||||
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 32, 64, 128, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
|
||||
|
||||
|
||||
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 256, 64, 128, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
|
||||
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 256, 128, 64, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
|
||||
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 128, 64, 64, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
|
||||
@@ -98,7 +98,7 @@ using device_permute_scale_f16_instances =
|
||||
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 64, 64, 16, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>>,
|
||||
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 32, 32, 16, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>>,
|
||||
DeviceElementwiseImpl<ck::Tuple<F16>, ck::Tuple<F16>, ElementwiseOp, NDims, 32, 16, 32, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>>
|
||||
|
||||
|
||||
>;
|
||||
|
||||
template <index_t NDims,
|
||||
@@ -132,7 +132,7 @@ using device_permute_scale_f32_instances = std::tuple<
|
||||
|
||||
#if 0
|
||||
// Disabled instances to improve compilation time
|
||||
// They listed here to show other possible combinations of parameters
|
||||
// They listed here to show other possible combinations of parameters
|
||||
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 256, 256, 256, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
|
||||
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 128, 256, 128, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
|
||||
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 128, 128, 256, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
|
||||
@@ -143,7 +143,7 @@ using device_permute_scale_f32_instances = std::tuple<
|
||||
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 64, 128, 128, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
|
||||
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 32, 128, 64, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
|
||||
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 32, 64, 128, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>,
|
||||
|
||||
|
||||
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 256, 64, 128, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
|
||||
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 256, 128, 64, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
|
||||
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 128, 64, 64, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
|
||||
@@ -169,7 +169,7 @@ using device_permute_scale_f32_instances = std::tuple<
|
||||
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 64, 128, 16, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
|
||||
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 32, 64, 16, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
|
||||
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 32, 32, 32, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
|
||||
#endif
|
||||
#endif
|
||||
|
||||
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 256, 64, 64, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>>,
|
||||
DeviceElementwiseImpl<ck::Tuple<F32>, ck::Tuple<F32>, ElementwiseOp, NDims, 256, 128, 32, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>>,
|
||||
|
||||
@@ -3,6 +3,7 @@ set(GROUPED_CONV3D_FWD_CONVSCALE
|
||||
xdl/device_grouped_conv3d_fwd_xdl_convscale_ndhwgc_gkzyxc_ndhwgk_f8_instance.cpp
|
||||
xdl/device_grouped_conv3d_fwd_xdl_convscale_ndhwgc_gkzyxc_ndhwgk_bf8_instance.cpp
|
||||
xdl/device_grouped_conv3d_fwd_xdl_convscale_ndhwgc_gkzyxc_ndhwgk_f8_bf8_instance.cpp
|
||||
xdl/device_grouped_conv3d_fwd_xdl_convscale_ndhwgc_gkzyxc_ndhwgk_bf8_f8_instance.cpp)
|
||||
xdl/device_grouped_conv3d_fwd_xdl_convscale_ndhwgc_gkzyxc_ndhwgk_bf8_f8_instance.cpp
|
||||
xdl/device_grouped_conv3d_fwd_xdl_combconvscale_ndhwgc_gkzyxc_ndhwgk_f8_f8_f32_instance.cpp)
|
||||
|
||||
add_instance_library(device_grouped_conv3d_fwd_convscale_instance ${GROUPED_CONV3D_FWD_CONVSCALE})
|
||||
|
||||
@@ -0,0 +1,61 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_outelementop_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_grouped_conv3d_fwd_xdl_combconvscale_ndhwgc_gkzyxc_ndhwgk_f8_f8_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<>,
|
||||
NDHWGK,
|
||||
F8,
|
||||
F8,
|
||||
ck::Tuple<>,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
CombConvScale,
|
||||
F8,
|
||||
F8>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_grouped_conv_fwd_xdl_outelementop_f8_f8_f32_instances<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<>,
|
||||
NDHWGK,
|
||||
ConvFwdDefault,
|
||||
CombConvScale>{});
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_grouped_conv_fwd_xdl_outelementop_f8_f8_f32_instances<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<>,
|
||||
NDHWGK,
|
||||
ConvFwd1x1P0,
|
||||
CombConvScale>{});
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_grouped_conv_fwd_xdl_outelementop_f8_f8_f32_instances<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<>,
|
||||
NDHWGK,
|
||||
ConvFwd1x1S1P0,
|
||||
CombConvScale>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -1,5 +1,6 @@
|
||||
# ONLY XDL_KERNELS
|
||||
set(GROUPED_CONV3D_FWD_CONVSCALE_RELU
|
||||
xdl/device_grouped_conv3d_fwd_xdl_convscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_instance.cpp)
|
||||
xdl/device_grouped_conv3d_fwd_xdl_convscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_instance.cpp
|
||||
xdl/device_grouped_conv3d_fwd_xdl_combconvscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_f8_f32_instance.cpp)
|
||||
|
||||
add_instance_library(device_grouped_conv3d_fwd_convscale_relu_instance ${GROUPED_CONV3D_FWD_CONVSCALE_RELU})
|
||||
|
||||
@@ -0,0 +1,61 @@
|
||||
// SPDX-License-Identifier: MIT
|
||||
// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_outelementop_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale_relu.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
void add_device_grouped_conv3d_fwd_xdl_combconvscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_f8_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<>,
|
||||
NDHWGK,
|
||||
F8,
|
||||
F8,
|
||||
ck::Tuple<>,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
CombConvScaleRelu,
|
||||
F8,
|
||||
F8>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_grouped_conv_fwd_xdl_outelementop_f8_f8_f32_instances<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<>,
|
||||
NDHWGK,
|
||||
ConvFwdDefault,
|
||||
CombConvScaleRelu>{});
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_grouped_conv_fwd_xdl_outelementop_f8_f8_f32_instances<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<>,
|
||||
NDHWGK,
|
||||
ConvFwd1x1P0,
|
||||
CombConvScaleRelu>{});
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_grouped_conv_fwd_xdl_outelementop_f8_f8_f32_instances<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<>,
|
||||
NDHWGK,
|
||||
ConvFwd1x1S1P0,
|
||||
CombConvScaleRelu>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
} // namespace ck
|
||||
@@ -3,16 +3,13 @@
|
||||
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_outelementop_instance.hpp"
|
||||
#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp"
|
||||
#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
|
||||
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale_relu.hpp"
|
||||
|
||||
namespace ck {
|
||||
namespace tensor_operation {
|
||||
namespace device {
|
||||
namespace instance {
|
||||
|
||||
using ConvScaleRelu = ck::tensor_operation::element_wise::ConvScaleRelu;
|
||||
|
||||
void add_device_grouped_conv3d_fwd_xdl_convscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
|
||||
NDHWGC,
|
||||
@@ -57,55 +54,6 @@ void add_device_grouped_conv3d_fwd_xdl_convscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_in
|
||||
ConvFwd1x1S1P0,
|
||||
ConvScaleRelu>{});
|
||||
}
|
||||
|
||||
namespace ew = ck::tensor_operation::element_wise;
|
||||
using CombConvScaleRelu = ew::UnaryCombinedOp<ew::Scale, ew::Scale, ew::Relu>;
|
||||
|
||||
void add_device_grouped_conv3d_fwd_xdl_combconvscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_f8_f32_instances(
|
||||
std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<>,
|
||||
NDHWGK,
|
||||
F8,
|
||||
F8,
|
||||
ck::Tuple<>,
|
||||
F32,
|
||||
PassThrough,
|
||||
PassThrough,
|
||||
CombConvScaleRelu,
|
||||
F8,
|
||||
F8>>>& instances)
|
||||
{
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_grouped_conv_fwd_xdl_outelementop_f8_f8_f32_instances<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<>,
|
||||
NDHWGK,
|
||||
ConvFwdDefault,
|
||||
CombConvScaleRelu>{});
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_grouped_conv_fwd_xdl_outelementop_f8_f8_f32_instances<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<>,
|
||||
NDHWGK,
|
||||
ConvFwd1x1P0,
|
||||
CombConvScaleRelu>{});
|
||||
add_device_operation_instances(
|
||||
instances,
|
||||
device_grouped_conv_fwd_xdl_outelementop_f8_f8_f32_instances<3,
|
||||
NDHWGC,
|
||||
GKZYXC,
|
||||
ck::Tuple<>,
|
||||
NDHWGK,
|
||||
ConvFwd1x1S1P0,
|
||||
CombConvScaleRelu>{});
|
||||
}
|
||||
|
||||
} // namespace instance
|
||||
} // namespace device
|
||||
} // namespace tensor_operation
|
||||
|
||||
Reference in New Issue
Block a user