diff --git a/client_example/24_grouped_conv_activation/CMakeLists.txt b/client_example/24_grouped_conv_activation/CMakeLists.txt
index 60f4ee41f7..dc55250bfe 100644
--- a/client_example/24_grouped_conv_activation/CMakeLists.txt
+++ b/client_example/24_grouped_conv_activation/CMakeLists.txt
@@ -1,6 +1,6 @@
 if(GPU_TARGETS MATCHES "gfx9")
 # Fwd scaleadd scaleadd relu
-add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32 
+add_executable(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32
 grouped_convnd_fwd_scaleadd_scaleadd_relu/grouped_conv_fwd_scaleadd_scaleadd_relu_fp32.cpp)
 target_link_libraries(client_grouped_convnd_fwd_scaleadd_scaleadd_relu_fp32 PRIVATE composable_kernel::device_conv_operations)
@@ -36,7 +36,7 @@ add_executable(client_grouped_convnd_fwd_bilinear_residual_fp16
 grouped_convnd_fwd_bilinear/grouped_conv_fwd_bilinear_residual_fp16.cpp)
 target_link_libraries(client_grouped_convnd_fwd_bilinear_residual_fp16 PRIVATE composable_kernel::device_conv_operations)
 # Fwd convinvscale
-add_executable(client_conv3d_fwd_convinvscale_fp8 
+add_executable(client_conv3d_fwd_convinvscale_fp8
 grouped_convnd_fwd_convinvscale/conv3d_fwd_convinvscale_fp8.cpp)
 target_link_libraries(client_conv3d_fwd_convinvscale_fp8 PRIVATE composable_kernel::device_conv_operations)
 # Fwd convscale + Bias
@@ -50,10 +50,18 @@ target_link_libraries(client_conv3d_fwd_convscale_relu_fp8 PRIVATE composable_ke
 # Fwd convscale + ReLU + AMAX
 add_executable(client_conv3d_fwd_convscale_relu_amax_fp8
 grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_relu_amax_fp8.cpp)
-target_link_libraries(client_conv3d_fwd_convscale_relu_amax_fp8 
- PRIVATE composable_kernel::device_conv_operations
- composable_kernel::device_other_operations
- composable_kernel::device_reduction_operations
+target_link_libraries(client_conv3d_fwd_convscale_relu_amax_fp8
+ PRIVATE composable_kernel::device_conv_operations
+ composable_kernel::device_other_operations
+ composable_kernel::device_reduction_operations
+ utility)
+# Fwd convscale + AMAX
+add_executable(client_conv3d_fwd_convscale_amax_fp8
+ grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_amax_fp8.cpp)
+target_link_libraries(client_conv3d_fwd_convscale_amax_fp8
+ PRIVATE composable_kernel::device_conv_operations
+ composable_kernel::device_other_operations
+ composable_kernel::device_reduction_operations
 utility)
 # Fwd convscale
 add_executable(client_conv3d_fwd_convscale_fp8
@@ -64,11 +72,11 @@ add_executable(client_conv3d_fwd_convscale_bf8
 grouped_convnd_fwd_convscale/conv3d_fwd_convscale_bf8.cpp)
 target_link_libraries(client_conv3d_fwd_convscale_bf8 PRIVATE composable_kernel::device_conv_operations)
-add_executable(client_conv3d_fwd_convscale_fp8_bf8 
+add_executable(client_conv3d_fwd_convscale_fp8_bf8
 grouped_convnd_fwd_convscale/conv3d_fwd_convscale_fp8_bf8.cpp)
 target_link_libraries(client_conv3d_fwd_convscale_fp8_bf8 PRIVATE composable_kernel::device_conv_operations)
-add_executable(client_conv3d_fwd_convscale_bf8_fp8 
+add_executable(client_conv3d_fwd_convscale_bf8_fp8
 grouped_convnd_fwd_convscale/conv3d_fwd_convscale_bf8_fp8.cpp)
 target_link_libraries(client_conv3d_fwd_convscale_bf8_fp8 PRIVATE composable_kernel::device_conv_operations)
 # Bwd data bilinear
diff --git a/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/common.hpp b/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/common.hpp
index b76c5191ec..c78cacf266 100644
--- 
a/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/common.hpp +++ b/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/common.hpp @@ -15,21 +15,18 @@ #include "ck/tensor_operation/gpu/device/device_elementwise.hpp" #include "ck/tensor_operation/gpu/device/device_reduce.hpp" #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" -#include "ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" #include "ck/utility/tuple.hpp" #include "ck/utility/type.hpp" #include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale_relu.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale.hpp" #include "ck/utility/reduction_enums.hpp" #include "ck/library/tensor_operation_instance/gpu/permute_scale.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp" #include "ck/library/utility/host_tensor.hpp" -namespace ew = ck::tensor_operation::element_wise; - -using PassThrough = ew::PassThrough; -using ConvScaleRelu = ew::UnaryCombinedOp; -using ConvScale = ew::UnaryCombinedOp; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using ConvScaleRelu = ck::tensor_operation::element_wise::ScaleScaleRelu; +using ConvScale = ck::tensor_operation::element_wise::ScaleScalePass; struct SimpleDeviceMem { @@ -221,7 +218,9 @@ bool run_grouped_conv_fwd_convscale_reduce( * FP8 Convolution with Scaling */ std::cout << "\n\nConvolution with scale Benchmarking:" << std::endl; - auto elementwise_op = ConvElementOp{ew::Scale{scale_in}, ew::Scale{scale_wei}, {}}; + auto elementwise_op = ConvElementOp{ck::tensor_operation::element_wise::Scale{scale_in}, + ck::tensor_operation::element_wise::Scale{scale_wei}, + {}}; auto conv_ok = ConvolutionScale, ck::Tuple, - ew::Scale, + ck::tensor_operation::element_wise::Scale, NumDimSpatial + NumNonSpatialDim>; // get device op instances @@ -483,7 +482,7 @@ bool TensorScaleConvert(SimpleDeviceMem& in, // profile device operation instances std::cout << "Run all DeviceScaleConvert instances and do timing" << std::endl; - auto scale_convert = ew::Scale{scale_out}; + auto scale_convert = ck::tensor_operation::element_wise::Scale{scale_out}; for(int i = 0; i < op_ptrs.size(); ++i) { diff --git a/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_amax_fp8.cpp b/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_amax_fp8.cpp new file mode 100644 index 0000000000..1c0299b841 --- /dev/null +++ b/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_amax_fp8.cpp @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+
+#include "common.hpp"
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+
+using InDataType       = ck::f8_t;
+using WeiDataType      = ck::f8_t;
+using CShuffleDataType = float;
+using ConvOutDataType  = float;    // data type of convolution result
+using OutDataType      = ck::f8_t; // data type of final result
+using AComputeDataType = ck::f8_t;
+using BComputeDataType = ck::f8_t;
+
+using ConvElementOp = ConvScale;
+
+using InLayout  = ck::tensor_layout::convolution::NDHWGC;
+using WeiLayout = ck::tensor_layout::convolution::GKZYXC;
+using OutLayout = ck::tensor_layout::convolution::NDHWGK;
+
+constexpr auto ReduceOpId = ck::ReduceTensorOp::AMAX;
+
+static constexpr ck::index_t NumDimSpatial = 3;
+static constexpr ck::index_t G  = 1;
+static constexpr ck::index_t N  = 64;
+static constexpr ck::index_t K  = 128;
+static constexpr ck::index_t C  = 64;
+static constexpr ck::index_t Z  = 3;
+static constexpr ck::index_t Y  = 3;
+static constexpr ck::index_t X  = 3;
+static constexpr ck::index_t Di = 28;
+static constexpr ck::index_t Hi = 28;
+static constexpr ck::index_t Wi = 3;
+static constexpr ck::index_t Do = 28;
+static constexpr ck::index_t Ho = 28;
+static constexpr ck::index_t Wo = 3;
+
+int main()
+{
+    return run_grouped_conv_fwd_convscale_reduce(
+               {N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C}, {N, Do, Ho, Wo, G, K})
+               ? EXIT_SUCCESS
+               : EXIT_FAILURE;
+}
diff --git a/example/62_convnd_activ/convscale_reduce/CMakeLists.txt b/example/62_convnd_activ/convscale_reduce/CMakeLists.txt
index b3c6621509..ff9020a707 100644
--- a/example/62_convnd_activ/convscale_reduce/CMakeLists.txt
+++ b/example/62_convnd_activ/convscale_reduce/CMakeLists.txt
@@ -4,7 +4,10 @@ foreach(gpu IN LISTS GPU_TARGETS)
 if(gpu IN_LIST gpu_list AND target EQUAL 0)
 add_custom_target(example_convnd_activ_xdl_convscale_reduce)
 add_example_executable(example_convnd_fwd_xdl_convscale_relu_amax_fp8 convnd_fwd_xdl_convscale_relu_amax_fp8.cpp)
- add_example_dependencies(example_convnd_activ_xdl_convscale_reduce example_convnd_fwd_xdl_convscale_relu_amax_fp8 )
+ add_example_dependencies(example_convnd_activ_xdl_convscale_reduce example_convnd_fwd_xdl_convscale_relu_amax_fp8)
+
+ add_example_executable(example_convnd_fwd_xdl_convscale_amax_fp8 convnd_fwd_xdl_convscale_amax_fp8.cpp)
+ add_example_dependencies(example_convnd_activ_xdl_convscale_reduce example_convnd_fwd_xdl_convscale_amax_fp8)
 set(target 1)
 endif()
diff --git a/example/62_convnd_activ/convscale_reduce/convnd_fwd_xdl_convscale_amax_fp8.cpp b/example/62_convnd_activ/convscale_reduce/convnd_fwd_xdl_convscale_amax_fp8.cpp
new file mode 100644
index 0000000000..a8b4fdbead
--- /dev/null
+++ b/example/62_convnd_activ/convscale_reduce/convnd_fwd_xdl_convscale_amax_fp8.cpp
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+ +#include "convnd_fwd_convscale_reduce_common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp" + +using InDataType = ck::f8_t; +using WeiDataType = ck::f8_t; +using AccDataType = float; +using CShuffleDataType = float; +using ConvOutDataType = float; // data type of convolution result +using OutDataType = ck::f8_t; // data type of final result +using AComputeDataType = ck::f8_t; +using BComputeDataType = ck::f8_t; + +template +using S = ck::Sequence; + +using InElementOp = PassThrough; +using WeiElementOp = PassThrough; +using OutElementOp = ConvScale; + +static constexpr auto ConvSpec = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +template +using DeviceGroupedConvNDFwdInstance = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< + NDimSpatial, + InLayout, + WeiLayout, + ck::Tuple<>, + OutLayout, + InDataType, + WeiDataType, + AccDataType, + CShuffleDataType, + ck::Tuple<>, + ConvOutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + ConvSpec, // ConvForwardSpecialization + GemmSpec, // GemmSpecialization + 1, // + 256, // BlockSize + 128, // MPerBlock + 256, // NPerBlock + 32, // KPerBlock + 8, // AK1 + 8, // BK1 + 32, // MPerXdl + 32, // NPerXdl + 2, // MXdlPerWave + 4, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 8, // ABlockTransferSrcScalarPerVector + 8, // ABlockTransferDstScalarPerVector_AK1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 8, // BBlockTransferSrcScalarPerVector + 8, // BBlockTransferDstScalarPerVector_BK1 + 1, // BBlockLdsExtraN + 1, + 1, + S<1, 32, 1, 8>, + 8, + AComputeDataType, + BComputeDataType>; + +#include "run_convnd_fwd_example.inc" + +int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 
0 : 1; } diff --git a/include/ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp b/include/ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp index d8bac8da7a..3cc1c3c42c 100644 --- a/include/ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp +++ b/include/ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp @@ -3,7 +3,6 @@ #pragma once -#include "ck/utility/data_type.hpp" #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" namespace ck { @@ -107,6 +106,9 @@ struct TrinaryWithUnaryCombinedOp UnaryOp2 unary_op2_{}; }; +using ScaleScalePass = UnaryCombinedOp; +using ScaleScaleRelu = UnaryCombinedOp; + } // namespace element_wise } // namespace tensor_operation } // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale.hpp index 63dcdc6053..e070b249e1 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale.hpp @@ -8,9 +8,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp" -#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" -#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" - +#include "ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp" #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" namespace ck { @@ -177,6 +175,88 @@ struct DeviceOperationInstanceFactory, + NDHWGK, + F8, + F8, + ck::Tuple<>, + F32, + PassThrough, + PassThrough, + CombConvScale, + F8, + F8>>>& instances); +#endif + +template +struct DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD> +{ + using DeviceOp = DeviceGroupedConvFwdMultipleABD; + + static auto GetInstances() + { + std::vector> op_ptrs; + if constexpr(NumDimSpatial == 3 && is_same_v && + is_same_v && is_same_v) + { +#ifdef CK_ENABLE_FP8 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv3d_fwd_xdl_combconvscale_ndhwgc_gkzyxc_ndhwgk_f8_f8_f32_instances( + op_ptrs); + } +#endif + } + return op_ptrs; + } +}; + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale_relu.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale_relu.hpp index 419f5a609a..a0651912d4 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale_relu.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale_relu.hpp @@ -9,7 +9,6 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp" #include "ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" namespace ck { @@ -100,8 +99,7 @@ struct DeviceOperationInstanceFactory< } }; -namespace ew = ck::tensor_operation::element_wise; -using CombConvScaleRelu = ew::UnaryCombinedOp; +using CombConvScaleRelu = 
ck::tensor_operation::element_wise::ScaleScaleRelu; #ifdef CK_ENABLE_FP8 void add_device_grouped_conv3d_fwd_xdl_combconvscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_f8_f32_instances( diff --git a/library/include/ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp b/library/include/ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp index 204c9a310d..1a70db3bf0 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp @@ -47,7 +47,7 @@ using device_permute_scale_f16_instances = #if 0 // Disabled instances to improve compilation time - // They listed here to show other possible combinations of parameters + // They listed here to show other possible combinations of parameters DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 256, 256, 256, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>, DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 128, 256, 128, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>, DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 128, 128, 256, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>, @@ -58,7 +58,7 @@ using device_permute_scale_f16_instances = DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 64, 128, 128, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>, DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 32, 128, 64, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>, DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 32, 64, 128, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>, - + DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 256, 64, 128, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>, DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 256, 128, 64, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>, DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 128, 64, 64, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>, @@ -98,7 +98,7 @@ using device_permute_scale_f16_instances = DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 64, 64, 16, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>>, DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 32, 32, 16, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>>, DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 32, 16, 32, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>> - + >; template , ck::Tuple, ElementwiseOp, NDims, 256, 256, 256, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>, DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 128, 256, 128, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>, DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 128, 128, 256, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>, @@ -143,7 +143,7 @@ using device_permute_scale_f32_instances = std::tuple< DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 64, 128, 128, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>, DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 32, 128, 64, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>, DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 32, 64, 128, 16, 16, ck::Sequence<1, 0>, ck::Sequence<16>, ck::Sequence<16>>, - + DeviceElementwiseImpl, ck::Tuple, 
ElementwiseOp, NDims, 256, 64, 128, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>, DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 256, 128, 64, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>, DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 128, 64, 64, 4, 8, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>, @@ -169,7 +169,7 @@ using device_permute_scale_f32_instances = std::tuple< DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 64, 128, 16, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>, DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 32, 64, 16, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>, DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 32, 32, 32, 8, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>, -#endif +#endif DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 256, 64, 64, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>>, DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 256, 128, 32, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>>, diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_convscale/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_convscale/CMakeLists.txt index c7f4a3527e..e20e3f49ed 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_convscale/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_convscale/CMakeLists.txt @@ -3,6 +3,7 @@ set(GROUPED_CONV3D_FWD_CONVSCALE xdl/device_grouped_conv3d_fwd_xdl_convscale_ndhwgc_gkzyxc_ndhwgk_f8_instance.cpp xdl/device_grouped_conv3d_fwd_xdl_convscale_ndhwgc_gkzyxc_ndhwgk_bf8_instance.cpp xdl/device_grouped_conv3d_fwd_xdl_convscale_ndhwgc_gkzyxc_ndhwgk_f8_bf8_instance.cpp - xdl/device_grouped_conv3d_fwd_xdl_convscale_ndhwgc_gkzyxc_ndhwgk_bf8_f8_instance.cpp) + xdl/device_grouped_conv3d_fwd_xdl_convscale_ndhwgc_gkzyxc_ndhwgk_bf8_f8_instance.cpp + xdl/device_grouped_conv3d_fwd_xdl_combconvscale_ndhwgc_gkzyxc_ndhwgk_f8_f8_f32_instance.cpp) add_instance_library(device_grouped_conv3d_fwd_convscale_instance ${GROUPED_CONV3D_FWD_CONVSCALE}) diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_convscale/xdl/device_grouped_conv3d_fwd_xdl_combconvscale_ndhwgc_gkzyxc_ndhwgk_f8_f8_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_convscale/xdl/device_grouped_conv3d_fwd_xdl_combconvscale_ndhwgc_gkzyxc_ndhwgk_f8_f8_f32_instance.cpp new file mode 100644 index 0000000000..2d387f1034 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_convscale/xdl/device_grouped_conv3d_fwd_xdl_combconvscale_ndhwgc_gkzyxc_ndhwgk_f8_f8_f32_instance.cpp @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_outelementop_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_xdl_combconvscale_ndhwgc_gkzyxc_ndhwgk_f8_f8_f32_instances( + std::vector, + NDHWGK, + F8, + F8, + ck::Tuple<>, + F32, + PassThrough, + PassThrough, + CombConvScale, + F8, + F8>>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_outelementop_f8_f8_f32_instances<3, + NDHWGC, + GKZYXC, + ck::Tuple<>, + NDHWGK, + ConvFwdDefault, + CombConvScale>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_outelementop_f8_f8_f32_instances<3, + NDHWGC, + GKZYXC, + ck::Tuple<>, + NDHWGK, + ConvFwd1x1P0, + CombConvScale>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_outelementop_f8_f8_f32_instances<3, + NDHWGC, + GKZYXC, + ck::Tuple<>, + NDHWGK, + ConvFwd1x1S1P0, + CombConvScale>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_convscale_relu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_convscale_relu/CMakeLists.txt index c60df5a733..8ba52adcb8 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_convscale_relu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_convscale_relu/CMakeLists.txt @@ -1,5 +1,6 @@ # ONLY XDL_KERNELS set(GROUPED_CONV3D_FWD_CONVSCALE_RELU - xdl/device_grouped_conv3d_fwd_xdl_convscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_instance.cpp) + xdl/device_grouped_conv3d_fwd_xdl_convscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_instance.cpp + xdl/device_grouped_conv3d_fwd_xdl_combconvscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_f8_f32_instance.cpp) add_instance_library(device_grouped_conv3d_fwd_convscale_relu_instance ${GROUPED_CONV3D_FWD_CONVSCALE_RELU}) diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_convscale_relu/xdl/device_grouped_conv3d_fwd_xdl_combconvscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_f8_f32_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_convscale_relu/xdl/device_grouped_conv3d_fwd_xdl_combconvscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_f8_f32_instance.cpp new file mode 100644 index 0000000000..1a27e64d31 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_convscale_relu/xdl/device_grouped_conv3d_fwd_xdl_combconvscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_f8_f32_instance.cpp @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_outelementop_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale_relu.hpp" +#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +void add_device_grouped_conv3d_fwd_xdl_combconvscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_f8_f32_instances( + std::vector, + NDHWGK, + F8, + F8, + ck::Tuple<>, + F32, + PassThrough, + PassThrough, + CombConvScaleRelu, + F8, + F8>>>& instances) +{ + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_outelementop_f8_f8_f32_instances<3, + NDHWGC, + GKZYXC, + ck::Tuple<>, + NDHWGK, + ConvFwdDefault, + CombConvScaleRelu>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_outelementop_f8_f8_f32_instances<3, + NDHWGC, + GKZYXC, + ck::Tuple<>, + NDHWGK, + ConvFwd1x1P0, + CombConvScaleRelu>{}); + add_device_operation_instances( + instances, + device_grouped_conv_fwd_xdl_outelementop_f8_f8_f32_instances<3, + NDHWGC, + GKZYXC, + ck::Tuple<>, + NDHWGK, + ConvFwd1x1S1P0, + CombConvScaleRelu>{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_convscale_relu/xdl/device_grouped_conv3d_fwd_xdl_convscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_convscale_relu/xdl/device_grouped_conv3d_fwd_xdl_convscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_instance.cpp index 1fda1f4ee6..91bfdda0d8 100644 --- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_convscale_relu/xdl/device_grouped_conv3d_fwd_xdl_convscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_convscale_relu/xdl/device_grouped_conv3d_fwd_xdl_convscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_instance.cpp @@ -3,16 +3,13 @@ #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_outelementop_instance.hpp" #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp" -#include "ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp" -#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale_relu.hpp" namespace ck { namespace tensor_operation { namespace device { namespace instance { -using ConvScaleRelu = ck::tensor_operation::element_wise::ConvScaleRelu; - void add_device_grouped_conv3d_fwd_xdl_convscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_instances( std::vector{}); } - -namespace ew = ck::tensor_operation::element_wise; -using CombConvScaleRelu = ew::UnaryCombinedOp; - -void add_device_grouped_conv3d_fwd_xdl_combconvscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_f8_f32_instances( - std::vector, - NDHWGK, - F8, - F8, - ck::Tuple<>, - F32, - PassThrough, - PassThrough, - CombConvScaleRelu, - F8, - F8>>>& instances) -{ - add_device_operation_instances( - instances, - device_grouped_conv_fwd_xdl_outelementop_f8_f8_f32_instances<3, - NDHWGC, - GKZYXC, - ck::Tuple<>, - NDHWGK, - ConvFwdDefault, - CombConvScaleRelu>{}); - add_device_operation_instances( - instances, - device_grouped_conv_fwd_xdl_outelementop_f8_f8_f32_instances<3, - NDHWGC, - GKZYXC, - ck::Tuple<>, - NDHWGK, - ConvFwd1x1P0, - CombConvScaleRelu>{}); - 
add_device_operation_instances( - instances, - device_grouped_conv_fwd_xdl_outelementop_f8_f8_f32_instances<3, - NDHWGC, - GKZYXC, - ck::Tuple<>, - NDHWGK, - ConvFwd1x1S1P0, - CombConvScaleRelu>{}); -} - } // namespace instance } // namespace device } // namespace tensor_operation
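
A note on the data flow that the new conv3d_fwd_convscale_amax_fp8 and convnd_fwd_xdl_convscale_amax_fp8 examples exercise, as wired up in common.hpp above: the convolution runs with the ConvScale (ScaleScalePass) elementwise op, so each fp32 accumulator appears to be multiplied by scale_in and scale_wei with no activation; an AMAX reduction is then taken over that scaled fp32 tensor; a separate permute-scale pass finally multiplies by scale_out and converts to fp8. The sketch below is only a host-side illustration of that arithmetic, with plain float standing in for ck::f8_t, made-up scale values, and illustrative variable names; it is not the CK device API.

// Host-side sketch of the convscale + AMAX epilogue (float stands in for ck::f8_t).
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
    // Stand-ins for the fp32 accumulators produced by the grouped convolution.
    std::vector<float> conv_acc = {0.25f, -1.5f, 3.0f, -0.75f};

    const float scale_in  = 0.5f;  // input dequantization scale
    const float scale_wei = 0.25f; // weight dequantization scale
    const float scale_out = 2.0f;  // output quantization scale

    // ConvScale (ScaleScalePass): e = acc * scale_in * scale_wei, no activation.
    std::vector<float> e(conv_acc.size());
    for(std::size_t i = 0; i < conv_acc.size(); ++i)
        e[i] = conv_acc[i] * scale_in * scale_wei;

    // AMAX reduction over the scaled fp32 tensor, before output quantization.
    float amax = 0.0f;
    for(float v : e)
        amax = std::max(amax, std::abs(v));

    // Final scale-convert pass: out = e * scale_out, then cast to fp8 (cast omitted here).
    std::vector<float> out(e.size());
    for(std::size_t i = 0; i < e.size(); ++i)
        out[i] = e[i] * scale_out;

    std::printf("amax = %f, out[0] = %f\n", amax, out[0]);
    return 0;
}

Assuming the usual CK build setup, the binaries added by this patch correspond to the client_conv3d_fwd_convscale_amax_fp8 and example_convnd_fwd_xdl_convscale_amax_fp8 targets declared in the CMakeLists changes above.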