From 10edb0c70ea1f6923dc1d6e9f80629a64f906209 Mon Sep 17 00:00:00 2001 From: Andriy Roshchenko <107577548+andriy-ca@users.noreply.github.com> Date: Tue, 20 Aug 2024 09:30:56 -0600 Subject: [PATCH] Adding Instances and Examples for FP8-based Scaled Convolution with ReLU Activation and AMAX Reduction. (#1469) * Enable CMakePresets build * Verify Convolution, Scaling and ReLU algorithms. * Add tensor element-wise scale and type cast operation. * Reduction implemented but does not work. * Exploration of Reduction functionality. * Completed example for Convolution scaled with ReLu activation and AMAX reduction. * WIP: Add required instances for convolution. * WIP: Create client example. Implement convolution stage. * Add elementwise instances. * Add elementwise scale + convert example. * Add reduction instances. * WIP: Client example for AMAX reduction. * WIP: Add instances for multistage reduction. * WIP: Implementation of multistage reduction. * Refactoring. * Clean up. * Guard off FP8 instances when the data type is not available. * Improve output readability. * Addressing reviewer's comments. [ROCm/composable_kernel commit: a94113a9418bb2333c72ff28b7a1fc44283fddba] --- .../24_grouped_conv_activation/CMakeLists.txt | 8 + .../common.hpp | 835 ++++++++++++++++++ .../conv3d_fwd_convscale_relu_amax_fp8.cpp | 58 ++ example/62_convnd_activ/CMakeLists.txt | 1 + .../convscale_reduce/CMakeLists.txt | 11 + .../convnd_fwd_convscale_reduce_common.hpp | 502 +++++++++++ ...convnd_fwd_xdl_convscale_relu_amax_fp8.cpp | 82 ++ .../run_convnd_fwd_example.inc | 98 ++ ...ped_conv_fwd_xdl_outelementop_instance.hpp | 37 + ...ped_convolution_forward_convscale_relu.hpp | 84 ++ .../gpu/permute_scale.hpp | 13 + .../device_permute_scale_instances.hpp | 46 + ...ce_instance_blockwise_f32_f32_f32_amax.hpp | 27 +- ..._relu_ndhwgc_gkzyxc_ndhwgk_f8_instance.cpp | 49 + .../gpu/permute_scale/CMakeLists.txt | 5 +- ...ce_permute_scale_6d_fp32_fp8_instances.cpp | 28 + ...ce_instance_blockwise_f32_f32_f32_amax.cpp | 27 +- 17 files changed, 1891 insertions(+), 20 deletions(-) create mode 100644 client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/common.hpp create mode 100644 client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_relu_amax_fp8.cpp create mode 100644 example/62_convnd_activ/convscale_reduce/CMakeLists.txt create mode 100644 example/62_convnd_activ/convscale_reduce/convnd_fwd_convscale_reduce_common.hpp create mode 100644 example/62_convnd_activ/convscale_reduce/convnd_fwd_xdl_convscale_relu_amax_fp8.cpp create mode 100644 example/62_convnd_activ/convscale_reduce/run_convnd_fwd_example.inc create mode 100644 library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_6d_fp32_fp8_instances.cpp diff --git a/client_example/24_grouped_conv_activation/CMakeLists.txt b/client_example/24_grouped_conv_activation/CMakeLists.txt index ae9b33b94e..60f4ee41f7 100644 --- a/client_example/24_grouped_conv_activation/CMakeLists.txt +++ b/client_example/24_grouped_conv_activation/CMakeLists.txt @@ -47,6 +47,14 @@ target_link_libraries(client_conv3d_fwd_convscale_add_fp8 PRIVATE composable_ker add_executable(client_conv3d_fwd_convscale_relu_fp8 grouped_convnd_fwd_convscale_relu/conv3d_fwd_convscale_relu_fp8.cpp) target_link_libraries(client_conv3d_fwd_convscale_relu_fp8 PRIVATE composable_kernel::device_conv_operations) +# Fwd convscale + ReLU + AMAX +add_executable(client_conv3d_fwd_convscale_relu_amax_fp8 + 
grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_relu_amax_fp8.cpp) +target_link_libraries(client_conv3d_fwd_convscale_relu_amax_fp8 + PRIVATE composable_kernel::device_conv_operations + composable_kernel::device_other_operations + composable_kernel::device_reduction_operations + utility) # Fwd convscale add_executable(client_conv3d_fwd_convscale_fp8 grouped_convnd_fwd_convscale/conv3d_fwd_convscale_fp8.cpp) diff --git a/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/common.hpp b/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/common.hpp new file mode 100644 index 0000000000..b76c5191ec --- /dev/null +++ b/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/common.hpp @@ -0,0 +1,835 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/library/utility/algorithm.hpp" +#include "ck/tensor_operation/gpu/device/device_elementwise.hpp" +#include "ck/tensor_operation/gpu/device/device_reduce.hpp" +#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" +#include "ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" +#include "ck/utility/tuple.hpp" +#include "ck/utility/type.hpp" +#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale_relu.hpp" +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/permute_scale.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp" +#include "ck/library/utility/host_tensor.hpp" + +namespace ew = ck::tensor_operation::element_wise; + +using PassThrough = ew::PassThrough; +using ConvScaleRelu = ew::UnaryCombinedOp; +using ConvScale = ew::UnaryCombinedOp; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +template +std::size_t +GetFlops(const std::array& output_lengths, + const std::array& weights_lengths, + const std::size_t& ds_size) +{ + // 2 * G * N * K * C * * + + // + ds_size * => + // => * ( 2 * C * + ds_size) => + // => G * N * K * * (2 * C * + + // ds_size) + ck::index_t G = weights_lengths[0]; + ck::index_t N = output_lengths[1]; + ck::index_t K = weights_lengths[1]; + ck::index_t C = weights_lengths[2]; + + return G * N * K * + std::accumulate(std::next(std::begin(output_lengths), NumNonSpatialDim), + std::end(output_lengths), + static_cast(1), + std::multiplies<>()) * + (ds_size + static_cast(2) * C * + std::accumulate(std::next(std::begin(weights_lengths), NumNonSpatialDim), + std::end(weights_lengths), + static_cast(1), + std::multiplies<>())); +} + +template +std::size_t GetTensorSize(const std::array& lengths) +{ + + return std::accumulate(std::begin(lengths), + std::end(lengths), + static_cast(1), + std::multiplies()); +} + +template +std::size_t +GetInputByte(const std::array& input_lengths) +{ + // sizeof(InDataType) * (G * N * C * ) + + return sizeof(InDataType) * GetTensorSize(input_lengths); +} + +template +std::size_t +GetWeightByte(const std::array& weights_lengths) +{ + // sizeof(WeiDataType) * (G * K * C * ) + + return 
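+    // i.e. sizeof(WeiDataType) * (G * K * C * Z * Y * X) for a 3-D GKZYXC filter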
sizeof(WeiDataType) * GetTensorSize(weights_lengths); +} + +template +std::size_t +GetOutputByte(const std::array& output_lengths) +{ + // sizeof(OutDataType) * (G * N * K * ); + return sizeof(OutDataType) * GetTensorSize(output_lengths); +} + +template +bool ConvolutionScale(SimpleDeviceMem& in, + SimpleDeviceMem& wei, + SimpleDeviceMem& out, + ConvElementOp elementwise_op, + const std::array& in_lengths, + const std::array& in_strides, + const std::array& wei_lengths, + const std::array& wei_strides, + const std::array& out_lengths, + const std::array& out_strides); + +template +bool TensorScaleConvert(SimpleDeviceMem& in, + SimpleDeviceMem& out, + float scale_out, + const std::array& lengths, + const std::array& strides); + +template +bool TensorFullReduction(SimpleDeviceMem& tensor, + SimpleDeviceMem& out_amax, + const std::array& lengths, + const std::array& strides); + +template +bool run_grouped_conv_fwd_convscale_reduce( + std::array in_lengths, + std::array wei_lengths, + std::array out_lengths) +{ + + namespace ctc = ck::tensor_layout::convolution; + static_assert(NumDimSpatial == 3 && ck::is_same_v && + ck::is_same_v && + ck::is_same_v, + "Unsupported configuration"); + + const ck::index_t G = in_lengths[4]; + const ck::index_t N = in_lengths[0]; + const ck::index_t K = wei_lengths[1]; + const ck::index_t C = in_lengths[5]; + const ck::index_t Z = wei_lengths[2]; + const ck::index_t Y = wei_lengths[3]; + const ck::index_t X = wei_lengths[4]; + const ck::index_t Di = in_lengths[1]; + const ck::index_t Hi = in_lengths[2]; + const ck::index_t Wi = in_lengths[3]; + const ck::index_t Do = out_lengths[1]; + const ck::index_t Ho = out_lengths[2]; + const ck::index_t Wo = out_lengths[3]; + + const std::size_t in_mem_size = sizeof(InDataType) * N * Di * Hi * Wi * G * C; + const std::size_t wei_mem_size = sizeof(WeiDataType) * G * K * Z * Y * X * C; + const std::size_t conv_out_mem_size = sizeof(ConvOutDataType) * N * Do * Ho * Wo * G * K; + const std::size_t out_mem_size = sizeof(OutDataType) * N * Do * Ho * Wo * G * K; + + SimpleDeviceMem in(in_mem_size); + SimpleDeviceMem wei(wei_mem_size); + SimpleDeviceMem conv_out(conv_out_mem_size); + SimpleDeviceMem out(out_mem_size); + + float scale_in = float(std::rand()) / float(RAND_MAX); + float scale_wei = float(std::rand()) / float(RAND_MAX); + float scale_out = float(std::rand()) / float(RAND_MAX); + + // We have NDHWGC/GKZYXC/NDHWGK (x, weight, y) in memory space. + // However, CK's API only accepts lengths and strides with order of GNCDHW/GKCZYX/GNKDHW. + // Hence, we need to adjust the order of strides. 
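+    // For a packed NDHWGC tensor the strides in memory order are
+    //   N: Di*Hi*Wi*G*C, Di: Hi*Wi*G*C, Hi: Wi*G*C, Wi: G*C, G: C, C: 1,
+    // listed below in GNCDHW order. E.g. with N = 2, Di = Hi = Wi = 4, G = 1,
+    // C = 8 this gives input_strides = {8, 512, 1, 128, 32, 8}.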
+ const std::array input_lengths{G, N, C, Di, Hi, Wi}; + const std::array input_strides{ + C, Di * Hi * Wi * G * C, 1, Hi * Wi * G * C, Wi * G * C, G * C}; + const std::array weights_lengths{G, K, C, Z, Y, X}; + const std::array weights_strides{ + K * Z * Y * X * C, Z * Y * X * C, 1, Y * X * C, X * C, C}; + const std::array output_lengths{G, N, K, Do, Ho, Wo}; + const std::array output_strides{ + K, Do * Ho * Wo * G * K, 1, Ho * Wo * G * K, Wo * G * K, G * K}; + + /* + * FP8 Convolution with Scaling + */ + std::cout << "\n\nConvolution with scale Benchmarking:" << std::endl; + auto elementwise_op = ConvElementOp{ew::Scale{scale_in}, ew::Scale{scale_wei}, {}}; + auto conv_ok = ConvolutionScale(in, + wei, + conv_out, + elementwise_op, + input_lengths, + input_strides, + weights_lengths, + weights_strides, + output_lengths, + output_strides); + + if(!conv_ok) + return false; + + /* + * Scale with output weight and convert to FP8 + */ + std::cout << "\n\nElement-wise scale + convert Benchmarking:" << std::endl; + auto elem_wise_ok = TensorScaleConvert( + conv_out, out, scale_out, output_lengths, output_strides); + + if(!elem_wise_ok) + return false; + + /* + * Compute AMAX + */ + std::cout << "\n\nAMAX Benchmarking:" << std::endl; + SimpleDeviceMem amax_device(sizeof(ConvOutDataType)); + auto reduction_ok = + TensorFullReduction(conv_out, amax_device, output_lengths, output_strides); + + if(!reduction_ok) + return false; + + return true; +} + +template +bool ConvolutionScale(SimpleDeviceMem& in, + SimpleDeviceMem& wei, + SimpleDeviceMem& out, + ConvElementOp elementwise_op, + const std::array& in_lengths, + const std::array& in_strides, + const std::array& wei_lengths, + const std::array& wei_strides, + const std::array& out_lengths, + const std::array& out_strides) +{ + + const std::array conv_filter_strides{1, 1, 1}; + const std::array conv_filter_dilations{1, 1, 1}; + const std::array input_left_pads{1, 1, 1}; + const std::array input_right_pads{1, 1, 1}; + + const auto in_mem_size = GetInputByte(in_lengths); + const auto wei_mem_size = GetWeightByte(wei_lengths); + const auto out_mem_size = GetOutputByte(out_lengths); + + std::size_t ds_size = 2; // 2 element-wise scale multipliers + if constexpr(ck::is_same_v) + { + ds_size += 1; // +1 element-wise relu + } + std::size_t flop = GetFlops(out_lengths, wei_lengths, ds_size); + std::size_t num_bytes = + in_mem_size + wei_mem_size + sizeof(float) + sizeof(float) + out_mem_size; + + using ConvDeviceOp = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD, + OutLayout, + InDataType, + WeiDataType, + ck::Tuple<>, + OutDataType, + PassThrough, + PassThrough, + ConvElementOp, + AComputeType, + BComputeType>; + // get device op instances + const auto conv_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + ConvDeviceOp>::GetInstances(); + + std::cout << "found " << conv_ptrs.size() << " instances" << std::endl; + + std::string conv_best_op_name; + int conv_best_op_id = -1; + float conv_best_avg_time = std::numeric_limits::max(); + float conv_best_gb_per_sec = 0; + float conv_best_tflops = 0; + + // profile device operation instances + std::cout << "Run all convolution instances and do timing" << std::endl; + + for(int i = 0; i < conv_ptrs.size(); ++i) + { + auto& op_ptr = conv_ptrs[i]; + auto argument_ptr = op_ptr->MakeArgumentPointer( + in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + std::array{}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + wei_lengths, + wei_strides, + std::array, 0>{}, + 
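+            // empty Ds arrays: no auxiliary D tensors are needed here, since the
+            // input/weight scale multipliers are fused into the CDE elementwise
+            // op rather than passed in as extra operands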
std::array, 0>{}, + out_lengths, + out_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + PassThrough{}, + PassThrough{}, + elementwise_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > conv_best_tflops) + { + conv_best_op_id = i; + conv_best_op_name = op_name; + conv_best_avg_time = avg_time; + conv_best_gb_per_sec = gb_per_sec; + conv_best_tflops = tflops; + } + } + else + { + std::cerr << op_name << " does not support this problem" << std::endl; + } + } + + if(conv_best_op_id < 0) + { + std::cerr << "no suitable instance" << std::endl; + return false; + } + + std::cout << "Best Perf: " << std::setw(10) << conv_best_avg_time << " ms, " << conv_best_tflops + << " TFlops, " << conv_best_gb_per_sec << " GB/s, " << conv_best_op_name << std::endl; + + // run the best instance + { + auto& op_ptr = conv_ptrs[conv_best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer( + in.GetDeviceBuffer(), + wei.GetDeviceBuffer(), + std::array{}, + out.GetDeviceBuffer(), + in_lengths, + in_strides, + wei_lengths, + wei_strides, + std::array, 0>{}, + std::array, 0>{}, + out_lengths, + out_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + PassThrough{}, + PassThrough{}, + elementwise_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return true; +} + +template +bool TensorScaleConvert(SimpleDeviceMem& in, + SimpleDeviceMem& out, + float scale_out, + const std::array& lengths, + const std::array& strides) +{ + + const auto tensor_size = GetTensorSize(lengths); + + const std::size_t in_mem_size = sizeof(InDataType) * tensor_size; + const std::size_t out_mem_size = sizeof(OutDataType) * tensor_size; + + std::size_t flop = 2 * tensor_size; // element-wise scale + convert + + std::size_t bytes = + in_mem_size + sizeof(float) + out_mem_size; // read from in, scale, write to out + + using DeviceScaleConvert = + ck::tensor_operation::device::DeviceElementwise, + ck::Tuple, + ew::Scale, + NumDimSpatial + NumNonSpatialDim>; + + // get device op instances + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceScaleConvert>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + int best_op_id = -1; + float best_avg_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + float best_tflops = 0; + + // profile device operation instances + std::cout << "Run all DeviceScaleConvert instances and do timing" << std::endl; + + auto scale_convert = ew::Scale{scale_out}; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + auto argument_ptr = op_ptr->MakeArgumentPointer(lengths, + {strides}, + {strides}, + {in.GetDeviceBuffer()}, + 
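+                                                        // the write to `out` applies
+                                                        // Scale{scale_out} and converts
+                                                        // F32 -> OutDataType (f8) on store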
{out.GetDeviceBuffer()}, + scale_convert); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = bytes / 1.E6 / avg_time; + + std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << tflops << " TFlops, " + << gb_per_sec << " GB/s, " << op_name << std::endl; + + if(tflops > best_tflops) + { + best_op_id = i; + best_op_name = op_name; + best_avg_time = avg_time; + best_gb_per_sec = gb_per_sec; + best_tflops = tflops; + } + } + else + { + std::cerr << op_name << " does not support this problem" << std::endl; + } + } + + if(best_op_id < 0) + { + std::cerr << "no suitable instance found." << std::endl; + return false; + } + else + { + std::cout << "Best Perf: " << std::setw(10) << best_avg_time << " ms, " << best_tflops + << " TFlops, " << best_gb_per_sec << " GB/s, " << best_op_name << std::endl; + + // run the best intance + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer(lengths, + {strides}, + {strides}, + {in.GetDeviceBuffer()}, + {out.GetDeviceBuffer()}, + scale_convert); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return true; +} + +template +bool TensorFullReduction(SimpleDeviceMem& tensor, + SimpleDeviceMem& out_amax, + const std::array& lengths, + const std::array& strides) +{ + const auto spatial_dim_size = std::accumulate(std::next(std::begin(lengths), NumNonSpatialDim), + std::end(lengths), + static_cast(1), + std::multiplies<>()); + const auto tensor_size = GetTensorSize(lengths); + + auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); }; + + // Get the reduction operation + using ReduceOperation = typename ck::reduce_binary_operator::opType; + using InElementwiseOperation = + typename ck::reduce_unary_operator::InElementwiseOperation; + using AccElementwiseOperation = + typename ck::reduce_unary_operator::AccElementwiseOperation; + + InElementwiseOperation in_elementwise_op; + AccElementwiseOperation acc_elementwise_op; + std::tie(in_elementwise_op, acc_elementwise_op) = + ck::reduce_unary_operator::GetElementwiseOperator( + static_cast(tensor_size)); + + std::array reduce_out_lengths{1}; + std::array reduce_out_strides{1}; + + SimpleDeviceMem partial_reduce_tensor(sizeof(OutDataType) * spatial_dim_size); + std::array reduce_part_lengths; + std::copy(std::next(std::begin(lengths), NumNonSpatialDim), + std::end(lengths), + std::begin(reduce_part_lengths)); + std::array reduce_part_strides; + copy(HostTensorDescriptor(reduce_part_lengths).GetStrides(), reduce_part_strides); + + { + std::cout << "\nReduction of nonspatial dimensions:" << std::endl; + using DeviceOp = + ck::tensor_operation::device::DeviceReduce; // OutputIndex + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + int best_op_id = -1; + float best_ave_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + std::array 
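+        // stage 1 of the multistage reduction: collapse the NumNonSpatialDim
+        // leading (G, N, K) dimensions, leaving one partial result per spatial
+        // position; stage 2 below reduces the remaining spatial dimensions to
+        // a single AMAX value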
reduce_dims; + std::iota(reduce_dims.begin(), reduce_dims.end(), 0); // 0,..., NumNonSpatialDim-1 + + ck::index_t num_in_elements = tensor_size; + ck::index_t num_out_elements = spatial_dim_size; + + // profile device operation instances + std::cout << "Run partial reduction and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer(lengths, + strides, + reduce_part_lengths, + reduce_part_strides, + reduce_dims, + 1.0, + 0.0, + tensor.GetDeviceBuffer(), + nullptr, + partial_reduce_tensor.GetDeviceBuffer(), + nullptr, + in_elementwise_op, + PassThrough{}); + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + std::size_t num_bytes = + num_in_elements * sizeof(InDataType) + num_out_elements * sizeof(OutDataType); + + float gb_per_sec = num_bytes / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec + << " GB/s, " << op_name << std::endl; + + if(ave_time < best_ave_time) + { + best_op_id = i; + best_op_name = op_name; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + if(best_op_id < 0) + { + std::cerr << "no suitable instance found." << std::endl; + return false; + } + else + { + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_op_name << std::endl; + + // run the best instance + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer(lengths, + strides, + reduce_part_lengths, + reduce_part_strides, + reduce_dims, + 1.0, + 0.0, + tensor.GetDeviceBuffer(), + nullptr, + partial_reduce_tensor.GetDeviceBuffer(), + nullptr, + in_elementwise_op, + PassThrough{}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + } + + { + std::cout << "\nReduction of spatial dimensions:" << std::endl; + using DeviceOp = ck::tensor_operation::device::DeviceReduce; // OutputIndex + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + int best_op_id = -1; + float best_ave_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + std::array reduce_dims; + std::iota(reduce_dims.begin(), reduce_dims.end(), 0); // 0,..., NumDimSpatial-1 + + ck::index_t num_in_elements = spatial_dim_size; + ck::index_t num_out_elements = 1; + + // profile device operation instances + std::cout << "Run final reduction and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer(reduce_part_lengths, + reduce_part_strides, + reduce_out_lengths, + reduce_out_strides, + reduce_dims, + 1.0, + 0.0, + partial_reduce_tensor.GetDeviceBuffer(), + nullptr, + out_amax.GetDeviceBuffer(), + nullptr, + PassThrough{}, + acc_elementwise_op); + auto invoker_ptr = 
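+            // stage 1 already applied |x| through in_elementwise_op, so this
+            // final stage passes its inputs through unchanged and applies
+            // acc_elementwise_op only to the accumulated result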
op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t num_bytes = + num_in_elements * sizeof(OutDataType) + num_out_elements * sizeof(OutDataType); + + float gb_per_sec = num_bytes / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec + << " GB/s, " << op_name << std::endl; + + if(ave_time < best_ave_time) + { + best_op_id = i; + best_op_name = op_name; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + if(best_op_id < 0) + { + std::cerr << "no suitable instance found." << std::endl; + return false; + } + else + { + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_op_name << std::endl; + + // run the best instance + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer(reduce_part_lengths, + reduce_part_strides, + reduce_out_lengths, + reduce_out_strides, + reduce_dims, + 1.0, + 0.0, + partial_reduce_tensor.GetDeviceBuffer(), + nullptr, + out_amax.GetDeviceBuffer(), + nullptr, + PassThrough{}, + acc_elementwise_op); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + } + + return true; +} diff --git a/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_relu_amax_fp8.cpp b/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_relu_amax_fp8.cpp new file mode 100644 index 0000000000..182642c030 --- /dev/null +++ b/client_example/24_grouped_conv_activation/grouped_convnd_fwd_convscale_reduce/conv3d_fwd_convscale_relu_amax_fp8.cpp @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
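+
+// Runs the FP8 forward-convolution pipeline in three device stages:
+//   1) grouped conv with fused ConvScaleRelu: y = relu(scale_in * scale_wei * conv(x, w)), kept in F32
+//   2) elementwise scale + convert:           y_f8 = f8(scale_out * y)
+//   3) full-tensor AMAX reduction:            amax = max(|y|)
+// Each stage enumerates the available instances, times them, and re-runs the
+// fastest one.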
+ +#include "common.hpp" + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/tensor_layout.hpp" + +using InDataType = ck::f8_t; +using WeiDataType = ck::f8_t; +using CShuffleDataType = float; +using ConvOutDataType = float; // data type of convolution result +using OutDataType = ck::f8_t; // data type of final result +using AComputeDataType = ck::f8_t; +using BComputeDataType = ck::f8_t; + +using ConvElementOp = ConvScaleRelu; + +using InLayout = ck::tensor_layout::convolution::NDHWGC; +using WeiLayout = ck::tensor_layout::convolution::GKZYXC; +using OutLayout = ck::tensor_layout::convolution::NDHWGK; + +constexpr auto ReduceOpId = ck::ReduceTensorOp::AMAX; + +static constexpr ck::index_t NumDimSpatial = 3; +static constexpr ck::index_t G = 1; +static constexpr ck::index_t N = 64; +static constexpr ck::index_t K = 128; +static constexpr ck::index_t C = 64; +static constexpr ck::index_t Z = 3; +static constexpr ck::index_t Y = 3; +static constexpr ck::index_t X = 3; +static constexpr ck::index_t Di = 28; +static constexpr ck::index_t Hi = 28; +static constexpr ck::index_t Wi = 3; +static constexpr ck::index_t Do = 28; +static constexpr ck::index_t Ho = 28; +static constexpr ck::index_t Wo = 3; + +int main() +{ + return run_grouped_conv_fwd_convscale_reduce( + {N, Di, Hi, Wi, G, C}, {G, K, Z, Y, X, C}, {N, Do, Ho, Wo, G, K}) + ? EXIT_SUCCESS + : EXIT_FAILURE; +} diff --git a/example/62_convnd_activ/CMakeLists.txt b/example/62_convnd_activ/CMakeLists.txt index 7114b4ad6d..ab136d99ba 100644 --- a/example/62_convnd_activ/CMakeLists.txt +++ b/example/62_convnd_activ/CMakeLists.txt @@ -3,6 +3,7 @@ add_subdirectory(convinvscale) add_subdirectory(convscale) add_subdirectory(convscale_relu) add_subdirectory(convscale_add) +add_subdirectory(convscale_reduce) add_subdirectory(multi_AB) add_subdirectory(unary) diff --git a/example/62_convnd_activ/convscale_reduce/CMakeLists.txt b/example/62_convnd_activ/convscale_reduce/CMakeLists.txt new file mode 100644 index 0000000000..b3c6621509 --- /dev/null +++ b/example/62_convnd_activ/convscale_reduce/CMakeLists.txt @@ -0,0 +1,11 @@ +list(APPEND gpu_list gfx908 gfx90a gfx940 gfx941 gfx942) +set(target 0) +foreach(gpu IN LISTS GPU_TARGETS) + if(gpu IN_LIST gpu_list AND target EQUAL 0) + add_custom_target(example_convnd_activ_xdl_convscale_reduce) + add_example_executable(example_convnd_fwd_xdl_convscale_relu_amax_fp8 convnd_fwd_xdl_convscale_relu_amax_fp8.cpp) + add_example_dependencies(example_convnd_activ_xdl_convscale_reduce example_convnd_fwd_xdl_convscale_relu_amax_fp8 ) + + set(target 1) + endif() +endforeach() diff --git a/example/62_convnd_activ/convscale_reduce/convnd_fwd_convscale_reduce_common.hpp b/example/62_convnd_activ/convscale_reduce/convnd_fwd_convscale_reduce_common.hpp new file mode 100644 index 0000000000..6940c20695 --- /dev/null +++ b/example/62_convnd_activ/convscale_reduce/convnd_fwd_convscale_reduce_common.hpp @@ -0,0 +1,502 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. 
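+
+// Common harness for the convscale + reduce examples: run_grouped_conv_fwd()
+// launches the fused-scale convolution, the scale-and-convert elementwise pass
+// and the AMAX reduction, accumulates their combined perf numbers and, when
+// verification is enabled, checks both the F8 output and the AMAX value
+// against host reference implementations.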
+ +#include +#include + +#include "ck/ck.hpp" + +#include "ck/library/utility/algorithm.hpp" +#include "ck/library/utility/check_err.hpp" +#include "ck/library/utility/device_memory.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/library/utility/host_tensor_generator.hpp" +#include "ck/library/utility/convolution_parameter.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_conv_fwd.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_reduce.hpp" +#include "ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_elementwise_dynamic_vector_dims_impl.hpp" +#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp" +#include "ck/utility/reduction_operator.hpp" +#include "ck/utility/reduction_enums.hpp" +#include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" +#include "ck/utility/type.hpp" + +namespace ew = ck::tensor_operation::element_wise; + +using PassThrough = ew::PassThrough; +using ConvScaleRelu = ew::UnaryCombinedOp; +using ConvScale = ew::UnaryCombinedOp; + +using UnaryScaleConvert = ew::Scale; + +void print_helper_msg() +{ + std::cout << "arg1: verification (0=no, 1=yes)\n" + << "arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n" + << "arg3: time kernel (0=no, 1=yes)\n" + << ck::utils::conv::get_conv_param_parser_helper_msg() << std::endl; +} + +template +inline __host__ __device__ constexpr double get_rtol() +{ + if constexpr(std::is_same_v) + { + return 1e-3; + } + else if constexpr(std::is_same_v) + { + return 1e-6; + } + else if constexpr(std::is_same_v) + { + return 1e-3; + } + else if constexpr(std::is_same_v) + { + return 5e-2; + } + else if constexpr(std::is_same_v) + { + return 1e-1; + } + else if constexpr(std::is_same_v) + { + return 1e-1; + } + else if constexpr(std::is_same_v) + { + return 1e-1; // 240 and 224 are acceptable + } + else if constexpr(std::is_same_v) + { + return 1.5e-1; // 57344 and 49152 are acceptable + } + else + { + return 1e-3; + } +} + +template +inline __host__ __device__ constexpr double get_atol() +{ + if constexpr(std::is_same_v) + { + return 1e-3; + } + else if constexpr(std::is_same_v) + { + return 1e-6; + } + else if constexpr(std::is_same_v) + { + return 1e-3; + } + else if constexpr(std::is_same_v) + { + return 5e-2; + } + else if constexpr(std::is_same_v) + { + return 1e-1; + } + else if constexpr(std::is_same_v) + { + return 1e-1; + } + else if constexpr(std::is_same_v) + { + return 16.1; // 240 and 224 are acceptable + } + else if constexpr(std::is_same_v) + { + return 8192.1; // 57344 and 49152 are acceptable + } + else + { + return 1e-3; + } +} + +template +bool run_grouped_conv_fwd(bool do_verification, + int init_method, + bool time_kernel, + const ck::utils::conv::ConvParam& conv_param, + const HostTensorDescriptor& in_g_n_c_wis_desc, + const HostTensorDescriptor& wei_g_k_c_xs_desc, + const HostTensorDescriptor& out_g_n_k_wos_desc, + const InElementOp& in_element_op, + const WeiElementOp& wei_element_op) +{ + Tensor in(in_g_n_c_wis_desc); + Tensor wei(wei_g_k_c_xs_desc); + Tensor host_conv(out_g_n_k_wos_desc); + Tensor device_conv(out_g_n_k_wos_desc); + Tensor out_host(out_g_n_k_wos_desc); + Tensor out_device(out_g_n_k_wos_desc); + + std::cout << "in: " << in.mDesc << std::endl; + std::cout << "wei: " << wei.mDesc << std::endl; + std::cout << "out: " << out_host.mDesc << std::endl; + + 
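+    // init_method: 0 = leave buffers uninitialized, 1 = small random integers,
+    // 11 = all ones (debug aid), anything else = small random decimals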
switch(init_method) + { + case 0: break; + case 1: + in.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + wei.GenerateTensorValue(GeneratorTensor_2{-5, 5}); + break; + case 11: // used for debugging + in.GenerateTensorValue(GeneratorTensor_1{1}); + wei.GenerateTensorValue(GeneratorTensor_1{1}); + break; + default: + in.GenerateTensorValue(GeneratorTensor_3{-1.0, 1.0}); + wei.GenerateTensorValue(GeneratorTensor_3{-0.5, 0.5}); + } + + DeviceMem in_device_buf(sizeof(InDataType) * in.mDesc.GetElementSpaceSize()); + DeviceMem wei_device_buf(sizeof(WeiDataType) * wei.mDesc.GetElementSpaceSize()); + DeviceMem conv_device_buf(conv_param.GetOutputByte()); + DeviceMem out_device_buf(conv_param.GetOutputByte()); + + in_device_buf.ToDevice(in.mData.data()); + wei_device_buf.ToDevice(wei.mData.data()); + + std::array a_g_n_c_wis_lengths{}; + std::array a_g_n_c_wis_strides{}; + std::array b_g_k_c_xs_lengths{}; + std::array b_g_k_c_xs_strides{}; + std::array e_g_n_k_wos_lengths{}; + std::array e_g_n_k_wos_strides{}; + std::array conv_filter_strides{}; + std::array conv_filter_dilations{}; + std::array input_left_pads{}; + std::array input_right_pads{}; + + auto copy = [](const auto& x, auto& y) { ck::ranges::copy(x, y.begin()); }; + + copy(in_g_n_c_wis_desc.GetLengths(), a_g_n_c_wis_lengths); + copy(in_g_n_c_wis_desc.GetStrides(), a_g_n_c_wis_strides); + copy(wei_g_k_c_xs_desc.GetLengths(), b_g_k_c_xs_lengths); + copy(wei_g_k_c_xs_desc.GetStrides(), b_g_k_c_xs_strides); + copy(out_g_n_k_wos_desc.GetLengths(), e_g_n_k_wos_lengths); + copy(out_g_n_k_wos_desc.GetStrides(), e_g_n_k_wos_strides); + copy(conv_param.conv_filter_strides_, conv_filter_strides); + copy(conv_param.conv_filter_dilations_, conv_filter_dilations); + copy(conv_param.input_left_pads_, input_left_pads); + copy(conv_param.input_right_pads_, input_right_pads); + + // random scale values + float scale_in = float(std::rand()) / float(RAND_MAX); + float scale_wei = float(std::rand()) / float(RAND_MAX); + float scale_out = float(std::rand()) / float(RAND_MAX); + + std::cout << std::endl; + std::cout << "scale_in: " << scale_in << std::endl; + std::cout << "scale_wei: " << scale_wei << std::endl; + std::cout << "scale_out: " << scale_out << std::endl; + + // convolution elementwise operation + auto conv_element_op = ConvElementOp{ew::Scale{scale_in}, ew::Scale{scale_wei}, {}}; + auto scale_convert = UnaryScaleConvert{scale_out}; // elementwise scale and type cast + + // do Conv + auto conv = DeviceConvNDFwdInstance{}; + auto conv_invoker = conv.MakeInvoker(); + auto conv_argument = + conv.MakeArgument(in_device_buf.GetDeviceBuffer(), + wei_device_buf.GetDeviceBuffer(), + std::array{}, + conv_device_buf.GetDeviceBuffer(), + a_g_n_c_wis_lengths, + a_g_n_c_wis_strides, + b_g_k_c_xs_lengths, + b_g_k_c_xs_strides, + std::array, 0>{}, + std::array, 0>{}, + e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + conv_filter_strides, + conv_filter_dilations, + input_left_pads, + input_right_pads, + in_element_op, + wei_element_op, + conv_element_op); + + if(!conv.IsSupportedArgument(conv_argument)) + { + throw std::runtime_error( + "wrong! 
device_conv with the specified compilation parameters does " + "not support this Conv problem"); + } + + std::string kernels = conv.GetTypeString(); + + float avg_time = conv_invoker.Run(conv_argument, StreamConfig{nullptr, time_kernel}); + + using DeviceElementwiseScale = ck::tensor_operation::device::DeviceElementwiseImpl< + ck::Tuple, // InDataTypeTuple + ck::Tuple, // OutDataTypeTuple + UnaryScaleConvert, // UnaryScaleConvert + NDimSpatial + 3, // NumDim + 256, // BlockSize + 128, // M0PerBlock + 128, // M1PerBlock + 8, // M0PerThread + 8, // M1PerThread + ck::Sequence<1, 0>, // ThreadClusterArrangeOrder + ck::Sequence<8>, // InScalarPerVectorSeq + ck::Sequence<8>>; // OutScalarPerVectorSeq + + auto device_ew_scale = DeviceElementwiseScale{}; + auto scale_invoker = device_ew_scale.MakeInvoker(); + auto scale_argument = device_ew_scale.MakeArgument(e_g_n_k_wos_lengths, + {e_g_n_k_wos_strides}, + {e_g_n_k_wos_strides}, + {conv_device_buf.GetDeviceBuffer()}, + {out_device_buf.GetDeviceBuffer()}, + scale_convert); + + if(!device_ew_scale.IsSupportedArgument(scale_argument)) + { + throw std::runtime_error( + "wrong! DeviceElementwiseScale with the specified compilation parameters does " + "not support this problem"); + } + + kernels += std::string("\n\t\t ") + device_ew_scale.GetTypeString(); + + avg_time += scale_invoker.Run(scale_argument, StreamConfig{nullptr, time_kernel}); + + constexpr auto ReduceOpId = ck::ReduceTensorOp::AMAX; + using ReduceOperation = typename ck::reduce_binary_operator::opType; + using InElementwiseOperation = + typename ck::reduce_unary_operator::InElementwiseOperation; + using AccElementwiseOperation = + typename ck::reduce_unary_operator::AccElementwiseOperation; + using DeviceReduceInstance = + ck::tensor_operation::device::DeviceReduceMultiBlock; // OutDstVectorSize + + std::vector outLengths = {1}; + Tensor amax_host(outLengths); + Tensor amax_from_device(outLengths); + auto amax_host_strides = amax_host.mDesc.GetStrides(); + + std::array reduce_dims; + std::iota(reduce_dims.begin(), reduce_dims.end(), 0); // 0,..., NDimSpatial+3-1 + + std::array reduce_out_lengths{1}; + std::array reduce_out_strides{static_cast(amax_host_strides[0])}; + + DeviceMem amax_device(sizeof(ConvOutDataType) * amax_host.mDesc.GetElementSpaceSize()); + DeviceMem index_device; + + InElementwiseOperation in_elementwise_op; + AccElementwiseOperation acc_elementwise_op; + std::tie(in_elementwise_op, acc_elementwise_op) = + ck::reduce_unary_operator::GetElementwiseOperator( + static_cast(host_conv.mDesc.GetElementSize())); + + // Hack convolution output strides for reduction as kernel expects stride 1 for the last + // dimension. It only works because the reduction is done on the whole tensor and result is + // independent of the order of elements. + std::array reduction_strides{}; + copy(HostTensorDescriptor(e_g_n_k_wos_lengths).GetStrides(), reduction_strides); + + auto device_reduce = DeviceReduceInstance{}; + auto reduce_invoker = device_reduce.MakeInvokerPointer(); + auto reduce_argument = device_reduce.MakeArgumentPointer(e_g_n_k_wos_lengths, + reduction_strides, + reduce_out_lengths, + reduce_out_strides, + reduce_dims, + 1.0, + 0.0, + conv_device_buf.GetDeviceBuffer(), + nullptr, + amax_device.GetDeviceBuffer(), + nullptr, + in_elementwise_op, + acc_elementwise_op); + + if(!device_reduce.IsSupportedArgument(reduce_argument.get())) + { + throw std::runtime_error( + "wrong! 
DeviceReduceInstance with the specified compilation parameters does " + "not support this runtime parameters!"); + }; + + kernels += std::string("\n\t\t ") + device_reduce.GetTypeString(); + + float reduce_time = + reduce_invoker->Run(reduce_argument.get(), StreamConfig{nullptr, time_kernel}); + + if(time_kernel) + std::cout << "\nReduce time: " << reduce_time << " ms" << std::endl; + + avg_time += reduce_time; + + std::size_t flop = conv_param.GetFlops(); // convolution FLOPs + auto conv_out_elems = host_conv.GetElementSize(); // number of elements in conv result tensor + + // 3 element-wise scale multipliers + 1 AMAX + std::size_t elementwise_ops = 3 + 1; + if constexpr(ck::is_same_v) + { + elementwise_ops += 1; // +1 element-wise relu + } + + flop += elementwise_ops * conv_out_elems; + + // convolution + elementwise scaling (in + wei + output byte count) + std::size_t num_btype = conv_param.GetByte(); + num_btype += sizeof(float) + sizeof(float); // + 2 scales + + // elementwise scaling + F8 conversion + num_btype += conv_param.GetOutputByte() + sizeof(float) + + conv_param.GetOutputByte(); + + // AMAX + num_btype += conv_param.GetOutputByte() + sizeof(float); + + if(time_kernel) + { + float tflops = static_cast(flop) / 1.E9 / avg_time; + float gb_per_sec = num_btype / 1.E6 / avg_time; + std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec + << " GB/s, " << std::endl; + } + + std::cout << "\nKernels: " << kernels << std::endl; + + if(do_verification) + { + auto ref_conv = ck::tensor_operation::host::ReferenceConvFwd(); + + auto ref_invoker = ref_conv.MakeInvoker(); + auto ref_argument = ref_conv.MakeArgument(in, + wei, + host_conv, + conv_param.conv_filter_strides_, + conv_param.conv_filter_dilations_, + conv_param.input_left_pads_, + conv_param.input_right_pads_, + in_element_op, + wei_element_op, + conv_element_op); + + ref_invoker.Run(ref_argument); + + conv_device_buf.FromDevice(device_conv.mData.data()); + + out_device_buf.FromDevice(out_device.mData.data()); + + out_host.ForEach([&](auto&, auto idx) { scale_convert(out_host(idx), host_conv(idx)); }); + + std::cout << "\nComparing output to reference: " << std::endl; + auto tight_tol_check = ck::utils::check_err(out_device, out_host, "Error: "); + if(!tight_tol_check) + { + std::cout << "\n\tRecompare applying tolerances...\n"; + std::cout << "\t\trtol = " << get_rtol() << std::endl; + std::cout << "\t\tatol = " << get_atol() << std::endl; + auto loose_tol_check = ck::utils::check_err(out_device, + out_host, + "Error: incorrect convolution results!", + get_rtol(), + get_atol()); + if(!loose_tol_check) + { + return false; + } + } + std::cout << "Success!" << std::endl; + + /// Verify AMAX + + using RefReduceInstance = + ck::tensor_operation::host::ReferenceReduce; + + auto ref_reduce = RefReduceInstance{}; + auto ref_reduce_invoker = ref_reduce.MakeInvokerPointer(); + auto ref_reduce_argument = ref_reduce.MakeArgumentPointer(e_g_n_k_wos_lengths, + e_g_n_k_wos_strides, + reduce_out_lengths, + reduce_out_strides, + reduce_dims, + 1.0, + 0.0, + host_conv.mData.data(), + nullptr, + amax_host.mData.data(), + nullptr, + in_elementwise_op, + acc_elementwise_op); + + if(!ref_reduce.IsSupportedArgument(ref_reduce_argument.get())) + { + throw std::runtime_error( + "wrong! 
RefReduceInstance with the specified compilation parameters does " + "not support this runtime parameters!"); + }; + + ref_reduce_invoker->Run(ref_reduce_argument.get()); + + amax_device.FromDevice(amax_from_device.mData.data()); + + std::cout << "\namax: " << amax_from_device.mData[0] << std::endl; + std::cout << "amax_ref: " << amax_host.mData[0] << std::endl; + + return ck::utils::check_err(amax_from_device, amax_host, "Error: incorrect AMAX results!"); + } + + return true; +} diff --git a/example/62_convnd_activ/convscale_reduce/convnd_fwd_xdl_convscale_relu_amax_fp8.cpp b/example/62_convnd_activ/convscale_reduce/convnd_fwd_xdl_convscale_relu_amax_fp8.cpp new file mode 100644 index 0000000000..df6bf7bd5c --- /dev/null +++ b/example/62_convnd_activ/convscale_reduce/convnd_fwd_xdl_convscale_relu_amax_fp8.cpp @@ -0,0 +1,82 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "convnd_fwd_convscale_reduce_common.hpp" + +#include "ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp" + +using InDataType = ck::f8_t; +using WeiDataType = ck::f8_t; +using AccDataType = float; +using CShuffleDataType = float; +using ConvOutDataType = float; // data type of convolution result +using OutDataType = ck::f8_t; // data type of final result +using AComputeDataType = ck::f8_t; +using BComputeDataType = ck::f8_t; + +template +using S = ck::Sequence; + +using InElementOp = PassThrough; +using WeiElementOp = PassThrough; +using OutElementOp = ConvScaleRelu; + +static constexpr auto ConvSpec = + ck::tensor_operation::device::ConvolutionForwardSpecialization::Default; + +static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKPadding; + +template +using DeviceGroupedConvNDFwdInstance = + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle< + NDimSpatial, + InLayout, + WeiLayout, + ck::Tuple<>, + OutLayout, + InDataType, + WeiDataType, + AccDataType, + CShuffleDataType, + ck::Tuple<>, + ConvOutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + ConvSpec, // ConvForwardSpecialization + GemmSpec, // GemmSpecialization + 1, // + 256, // BlockSize + 128, // MPerBlock + 256, // NPerBlock + 32, // KPerBlock + 8, // AK1 + 8, // BK1 + 32, // MPerXdl + 32, // NPerXdl + 2, // MXdlPerWave + 4, // NXdlPerWave + S<4, 64, 1>, // ABlockTransferThreadClusterLengths_AK0_M_AK1 + S<1, 0, 2>, // ABlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // ABlockTransferSrcAccessOrder + 2, // ABlockTransferSrcVectorDim + 8, // ABlockTransferSrcScalarPerVector + 8, // ABlockTransferDstScalarPerVector_AK1 + 1, // ABlockLdsExtraM + S<4, 64, 1>, // BBlockTransferThreadClusterLengths_BK0_N_BK1 + S<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder + S<1, 0, 2>, // BBlockTransferSrcAccessOrder + 2, // BBlockTransferSrcVectorDim + 8, // BBlockTransferSrcScalarPerVector + 8, // BBlockTransferDstScalarPerVector_BK1 + 1, // BBlockLdsExtraN + 1, + 1, + S<1, 32, 1, 8>, + 8, + AComputeDataType, + BComputeDataType>; + +#include "run_convnd_fwd_example.inc" + +int main(int argc, char* argv[]) { return run_convnd_fwd_example(argc, argv) ? 
0 : 1; } diff --git a/example/62_convnd_activ/convscale_reduce/run_convnd_fwd_example.inc b/example/62_convnd_activ/convscale_reduce/run_convnd_fwd_example.inc new file mode 100644 index 0000000000..24775f21b5 --- /dev/null +++ b/example/62_convnd_activ/convscale_reduce/run_convnd_fwd_example.inc @@ -0,0 +1,98 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +bool run_convnd_fwd_example(int argc, char* argv[]) +{ + print_helper_msg(); + + bool do_verification = true; + int init_method = 1; + bool time_kernel = false; + + ck::utils::conv::ConvParam conv_param{ + 2, 1, 128, 256, 192, {3, 3}, {71, 71}, {2, 2}, {1, 1}, {1, 1}, {1, 1}}; + + if(argc == 1) + { + // use default + } + else if(argc == 4) + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + } + else + { + do_verification = std::stoi(argv[1]); + init_method = std::stoi(argv[2]); + time_kernel = std::stoi(argv[3]); + const ck::index_t num_dim_spatial = std::stoi(argv[4]); + + conv_param = ck::utils::conv::parse_conv_param(num_dim_spatial, 5, argv); + } + + // instantiate in and wei element ops, will + // instantiate out_element_op below for every iteration + const auto in_element_op = InElementOp{}; + const auto wei_element_op = WeiElementOp{}; + + const auto run = [&](auto ndim_spatial, auto in_layout, auto wei_layout, auto out_layout) { + constexpr ck::index_t ndim_spatial_value = ndim_spatial.value; + + using InLayout = decltype(in_layout); + using WeiLayout = decltype(wei_layout); + using OutLayout = decltype(out_layout); + + const auto in_g_n_c_wis_desc = + ck::utils::conv::make_input_host_tensor_descriptor_g_n_c_wis_packed( + conv_param); + + const auto wei_g_k_c_xs_desc = + ck::utils::conv::make_weight_host_tensor_descriptor_g_k_c_xs_packed( + conv_param); + + const auto out_g_n_k_wos_desc = + ck::utils::conv::make_output_host_tensor_descriptor_g_n_k_wos_packed( + conv_param); + + return run_grouped_conv_fwd< + ndim_spatial_value, + InDataType, + WeiDataType, + ConvOutDataType, + OutDataType, + InElementOp, + WeiElementOp, + OutElementOp, + DeviceGroupedConvNDFwdInstance>( + do_verification, + init_method, + time_kernel, + conv_param, + in_g_n_c_wis_desc, + wei_g_k_c_xs_desc, + out_g_n_k_wos_desc, + in_element_op, + wei_element_op); + }; + + namespace ctc = ck::tensor_layout::convolution; + + if(conv_param.num_dim_spatial_ == 1) + { + return run(ck::Number<1>{}, ctc::GNWC{}, ctc::GKXC{}, ctc::GNWK{}); + } + else if(conv_param.num_dim_spatial_ == 2) + { + return run(ck::Number<2>{}, ctc::GNHWC{}, ctc::GKYXC{}, ctc::GNHWK{}); + } + else if(conv_param.num_dim_spatial_ == 3) + { + return run(ck::Number<3>{}, ctc::GNDHWC{}, ctc::GKZYXC{}, ctc::GNDHWK{}); + } + + return true; +} diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_outelementop_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_outelementop_instance.hpp index e3bec17514..89e9b2e763 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_outelementop_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_outelementop_instance.hpp @@ -184,6 +184,43 @@ using device_grouped_conv_fwd_xdl_outelementop_bf8_f8_instances = std::tuple< // clang-format on >; +template +using 
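+// F8 A/B with F32 E instances for the fused output-elementwise kernels; the
+// first entry (ScalarPerVector = 1) is a generic fallback for shapes the
+// vectorized instances below cannot handle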
device_grouped_conv_fwd_xdl_outelementop_f8_f8_f32_instances = std::tuple< +// clang-format off + //########################################| NumDim| A| B| Ds| E| AData| BData| AccData| CShuffle| Ds| EData| A| B| CDE| ConvForward| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| Compute| Compute| + //########################################| Spatial| Layout| Layout| Layout| Layout| Type| Type| Type| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Specialization| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector| TypeA| TypeB| + //########################################| | | | | | | | | | | | Operation| Operation| Operation| | | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl| | | + //########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | +#ifdef CK_ENABLE_FP8 + // generic instance + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, F8, F8>, + // instances for small conv.K and conv.C + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 1, F8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 1, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8, F8>, + + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmMNKPadding, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 256, 32, 8, 8, 32, 32, 2, 4, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, F8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, OutElementOp, ConvSpec, 
GemmMNKPadding, 1, 256, 128, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, F8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmMNKPadding, 1, 128, 64, 128, 32, 8, 8, 32, 32, 2, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, F8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 64, 32, 8, 8, 32, 32, 2, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, F8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmMNKPadding, 1, 256, 128, 64, 32, 8, 8, 32, 32, 2, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmMNKPadding, 1, 256, 64, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, F8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmMNKPadding, 1, 128, 128, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 4>, 8, F8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmMNKPadding, 1, 128, 32, 128, 32, 8, 8, 32, 32, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 8>, 8, F8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmMNKPadding, 1, 64, 64, 32, 32, 8, 8, 32, 32, 2, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, F8, F8>, + DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle, F32, PassThrough, PassThrough, OutElementOp, ConvSpec, GemmMNKPadding, 1, 64, 32, 64, 32, 8, 8, 32, 32, 1, 2, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 16, 1, 4>, 8, F8, F8> +#endif + // clang-format on + >; + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale_relu.hpp b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale_relu.hpp index ad86d066f7..419f5a609a 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale_relu.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale_relu.hpp @@ -8,6 +8,7 @@ #include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp" +#include 
"ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp" #include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp" #include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" @@ -99,6 +100,89 @@ struct DeviceOperationInstanceFactory< } }; +namespace ew = ck::tensor_operation::element_wise; +using CombConvScaleRelu = ew::UnaryCombinedOp; + +#ifdef CK_ENABLE_FP8 +void add_device_grouped_conv3d_fwd_xdl_combconvscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_f8_f32_instances( + std::vector, + NDHWGK, + F8, + F8, + ck::Tuple<>, + F32, + PassThrough, + PassThrough, + CombConvScaleRelu, + F8, + F8>>>& instances); +#endif + +template +struct DeviceOperationInstanceFactory< + ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD> +{ + using DeviceOp = DeviceGroupedConvFwdMultipleABD; + + static auto GetInstances() + { + std::vector> op_ptrs; + if constexpr(NumDimSpatial == 3 && is_same_v && + is_same_v && is_same_v) + { +#ifdef CK_ENABLE_FP8 + if constexpr(is_same_v && is_same_v && + is_same_v && is_same_v && + is_same_v) + { + add_device_grouped_conv3d_fwd_xdl_combconvscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_f8_f32_instances( + op_ptrs); + } +#endif + } + return op_ptrs; + } +}; + } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/permute_scale.hpp b/library/include/ck/library/tensor_operation_instance/gpu/permute_scale.hpp index 4f5d022f9c..eb71f9d8e5 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/permute_scale.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/permute_scale.hpp @@ -70,6 +70,12 @@ void add_device_permute_scale_6d_f32_instances( DeviceElementwise, ck::Tuple, element_wise::Scale, 6>>>&); #endif +#ifdef CK_ENABLE_FP8 +void add_device_permute_scale_6d_f32_f8_instances( + std::vector, ck::Tuple, element_wise::Scale, 6>>>&); +#endif + template > && + is_same_v>) + { + add_device_permute_scale_6d_f32_f8_instances(op_ptrs); + } #endif } return op_ptrs; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp b/library/include/ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp index 8a22005413..204c9a310d 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp @@ -10,6 +10,7 @@ namespace tensor_operation { namespace device { namespace instance { +using F8 = ck::f8_t; using F16 = ck::half_t; using F32 = float; @@ -183,6 +184,51 @@ using device_permute_scale_f32_instances = std::tuple< DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 32, 32, 16, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>>, DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 32, 16, 32, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>> >; + +#ifdef CK_ENABLE_FP8 +template +using device_permute_scale_f32_f8_instances = std::tuple< + DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 256, 64, 64, 4, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>, + DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 256, 128, 32, 4, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>, + DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 256, 32, 128, 4, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>, + 
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp b/library/include/ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp
index 8a22005413..204c9a310d 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp
@@ -10,6 +10,7 @@ namespace tensor_operation {
 namespace device {
 namespace instance {
 
+using F8  = ck::f8_t;
 using F16 = ck::half_t;
 using F32 = float;
 
@@ -183,6 +184,51 @@ using device_permute_scale_f32_instances = std::tuple<
     DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 32, 32, 16, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>>,
     DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 32, 16, 32, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>>
     >;
+
+#ifdef CK_ENABLE_FP8
+template 
+using device_permute_scale_f32_f8_instances = std::tuple<
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 256, 64, 64, 4, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 256, 128, 32, 4, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 256, 32, 128, 4, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 128, 64, 32, 4, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 128, 32, 64, 4, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 128, 16, 128, 4, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 128, 128, 16, 4, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 64, 16, 64, 4, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 64, 64, 16, 4, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 32, 32, 16, 4, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 32, 16, 32, 4, 4, ck::Sequence<1, 0>, ck::Sequence<4>, ck::Sequence<4>>,
+
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 256, 128, 128, 8, 8, ck::Sequence<1, 0>, ck::Sequence<8>, ck::Sequence<8>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 256, 256, 64, 8, 8, ck::Sequence<1, 0>, ck::Sequence<8>, ck::Sequence<8>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 256, 64, 256, 8, 8, ck::Sequence<1, 0>, ck::Sequence<8>, ck::Sequence<8>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 128, 128, 64, 8, 8, ck::Sequence<1, 0>, ck::Sequence<8>, ck::Sequence<8>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 128, 64, 128, 8, 8, ck::Sequence<1, 0>, ck::Sequence<8>, ck::Sequence<8>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 128, 32, 256, 8, 8, ck::Sequence<1, 0>, ck::Sequence<8>, ck::Sequence<8>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 128, 256, 32, 8, 8, ck::Sequence<1, 0>, ck::Sequence<8>, ck::Sequence<8>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 64, 64, 64, 8, 8, ck::Sequence<1, 0>, ck::Sequence<8>, ck::Sequence<8>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 64, 32, 128, 8, 8, ck::Sequence<1, 0>, ck::Sequence<8>, ck::Sequence<8>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 64, 128, 32, 8, 8, ck::Sequence<1, 0>, ck::Sequence<8>, ck::Sequence<8>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 32, 64, 32, 8, 8, ck::Sequence<1, 0>, ck::Sequence<8>, ck::Sequence<8>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 32, 32, 64, 8, 8, ck::Sequence<1, 0>, ck::Sequence<8>, ck::Sequence<8>>,
+
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 256, 64, 64, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 256, 128, 32, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 256, 32, 128, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 128, 64, 32, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 128, 32, 64, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 128, 16, 128, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 128, 128, 16, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 64, 32, 32, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 64, 16, 64, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 64, 64, 16, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 32, 32, 16, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>>,
+    DeviceElementwiseImpl, ck::Tuple, ElementwiseOp, NDims, 32, 16, 32, 4, 4, ck::Sequence<1, 0>, ck::Sequence<1>, ck::Sequence<1>>
+    >;
+#endif
 // clang-format on
 
 } // namespace instance
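Editor's sketch (not part of the patch): each entry above is one kernel configuration, block size, M/N tile sizes, M/N values per thread, thread-cluster order, and the input/output scalars-per-vector (the trailing ck::Sequence<4>/<8>/<1> pairs). Since the alias is a plain std::tuple, coverage can be checked at compile time; the instantiation arguments <6, Scale> follow the usage in the new .cpp file later in this patch, and the count of 36 assumes all three groups compile in.

#include <tuple>
#include "ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp"

using Scale = ck::tensor_operation::element_wise::Scale;
using F32ToF8Instances =
    ck::tensor_operation::device::instance::device_permute_scale_f32_f8_instances<6, Scale>;

// 12 four-wide + 12 eight-wide + 12 scalar (vector width 1) configurations.
static_assert(std::tuple_size_v<F32ToF8Instances> == 36, "expected 36 F32->F8 configs");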
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp
index ec3bc852e8..142d3f4227 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp
@@ -14,15 +14,24 @@ namespace device {
 namespace instance {
 
 // clang-format off
-// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-extern template void add_device_reduce_instance_blockwise(std::vector>&);
-extern template void add_device_reduce_instance_blockwise(std::vector>&);
-extern template void add_device_reduce_instance_blockwise(std::vector>&);
-extern template void add_device_reduce_instance_blockwise(std::vector>&);
-extern template void add_device_reduce_instance_blockwise(std::vector>&);
-extern template void add_device_reduce_instance_blockwise(std::vector>&);
-extern template void add_device_reduce_instance_blockwise(std::vector>&);
-extern template void add_device_reduce_instance_blockwise(std::vector>&);
+// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
+extern template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector>&);
+extern template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector>&);
+extern template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector>&);
+extern template void add_device_reduce_instance_blockwise< F32, F32, F32, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector>&);
+extern template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector>&);
+extern template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector>&);
+extern template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector>&);
+extern template void add_device_reduce_instance_blockwise< F32, F32, F32, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector>&);
+extern template void add_device_reduce_instance_blockwise< F32, F32, F32, 6, 6, ReduceAMax, UnaryAbs, PassThrough, true, false>(std::vector>&);
+extern template void add_device_reduce_instance_blockwise< F32, F32, F32, 5, 5, ReduceAMax, UnaryAbs, PassThrough, true, false>(std::vector>&);
+extern template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 4, ReduceAMax, UnaryAbs, PassThrough, true, false>(std::vector>&);
+extern template void add_device_reduce_instance_blockwise< F32, F32, F32, 6, 3, ReduceAMax, UnaryAbs, PassThrough, true, false>(std::vector>&);
+extern template void add_device_reduce_instance_blockwise< F32, F32, F32, 5, 3, ReduceAMax, UnaryAbs, PassThrough, true, false>(std::vector>&);
+extern template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 3, ReduceAMax, UnaryAbs, PassThrough, true, false>(std::vector>&);
+extern template void add_device_reduce_instance_blockwise< F32, F32, F32, 3, 3, ReduceAMax, PassThrough, PassThrough, true, false>(std::vector>&);
+extern template void add_device_reduce_instance_blockwise< F32, F32, F32, 2, 2, ReduceAMax, PassThrough, PassThrough, true, false>(std::vector>&);
+extern template void add_device_reduce_instance_blockwise< F32, F32, F32, 1, 1, ReduceAMax, PassThrough, PassThrough, true, false>(std::vector>&);
 // clang-format on
 
 } // namespace instance
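Editor's reference (not part of the patch): the AMAX combination declared above composes UnaryAbs with a max reduction, i.e. amax(x) = max_i |x_i|, the quantity FP8 workflows use to derive scaling factors. A minimal host-side reference, useful when validating the device instances:

#include <algorithm>
#include <cmath>
#include <vector>

// Reference amax: |.| applied elementwise, then a running max, matching the
// UnaryAbs + max-reduction pairing in the instances above.
float amax_ref(const std::vector<float>& x)
{
    float result = 0.f;
    for(float v : x)
        result = std::max(result, std::abs(v));
    return result;
}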
diff --git a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_convscale_relu/xdl/device_grouped_conv3d_fwd_xdl_convscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_instance.cpp b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_convscale_relu/xdl/device_grouped_conv3d_fwd_xdl_convscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_instance.cpp
index 472da0da78..1fda1f4ee6 100644
--- a/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_convscale_relu/xdl/device_grouped_conv3d_fwd_xdl_convscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/grouped_conv3d_fwd_convscale_relu/xdl/device_grouped_conv3d_fwd_xdl_convscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_instance.cpp
@@ -3,6 +3,7 @@
 
 #include "ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_outelementop_instance.hpp"
 #include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+#include "ck/tensor_operation/gpu/element/combined_element_wise_operation.hpp"
 #include "ck/tensor_operation/gpu/element/unary_element_wise_operation.hpp"
 
 namespace ck {
@@ -57,6 +58,54 @@ void add_device_grouped_conv3d_fwd_xdl_convscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_in
         ConvScaleRelu>{});
 }
 
+namespace ew = ck::tensor_operation::element_wise;
+using CombConvScaleRelu = ew::UnaryCombinedOp;
+
+void add_device_grouped_conv3d_fwd_xdl_combconvscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_f8_f32_instances(
+    std::vector,
+                                NDHWGK,
+                                F8,
+                                F8,
+                                ck::Tuple<>,
+                                F32,
+                                PassThrough,
+                                PassThrough,
+                                CombConvScaleRelu,
+                                F8,
+                                F8>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_outelementop_f8_f8_f32_instances<3,
+                                                                     NDHWGC,
+                                                                     GKZYXC,
+                                                                     ck::Tuple<>,
+                                                                     NDHWGK,
+                                                                     ConvFwdDefault,
+                                                                     CombConvScaleRelu>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_outelementop_f8_f8_f32_instances<3,
+                                                                     NDHWGC,
+                                                                     GKZYXC,
+                                                                     ck::Tuple<>,
+                                                                     NDHWGK,
+                                                                     ConvFwd1x1P0,
+                                                                     CombConvScaleRelu>{});
+    add_device_operation_instances(
+        instances,
+        device_grouped_conv_fwd_xdl_outelementop_f8_f8_f32_instances<3,
+                                                                     NDHWGC,
+                                                                     GKZYXC,
+                                                                     ck::Tuple<>,
+                                                                     NDHWGK,
+                                                                     ConvFwd1x1S1P0,
+                                                                     CombConvScaleRelu>{});
+}
+
 } // namespace instance
 } // namespace device
 } // namespace tensor_operation
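Editor's sketch (not part of the patch): the three registrations above reuse one tile-configuration list under three convolution specializations, ConvFwdDefault (arbitrary filter/stride/pad), ConvFwd1x1P0 (1x1x1 filter, zero pad), and ConvFwd1x1S1P0 (additionally unit stride). Calling the function directly could look as follows; the vector's element type is filled in by assumption from the declaration above (the factory header is the usual entry point), and the wrapper function name is hypothetical.

#include <memory>
#include <vector>
#include "ck/library/tensor_operation_instance/gpu/grouped_convolution_forward_convscale_relu.hpp"

// Hypothetical helper; element type reconstructed by assumption.
void collect_combconvscale_relu_instances()
{
    using namespace ck::tensor_operation::device;
    using namespace ck::tensor_operation::device::instance;

    std::vector<std::unique_ptr<DeviceGroupedConvFwdMultipleABD<
        3, NDHWGC, GKZYXC, ck::Tuple<>, NDHWGK, F8, F8, ck::Tuple<>, F32,
        PassThrough, PassThrough, CombConvScaleRelu, F8, F8>>> instances;

    // Fills one op per tile configuration per ConvSpec (Default, 1x1P0, 1x1S1P0).
    add_device_grouped_conv3d_fwd_xdl_combconvscale_relu_ndhwgc_gkzyxc_ndhwgk_f8_f8_f32_instances(
        instances);
}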
diff --git a/library/src/tensor_operation_instance/gpu/permute_scale/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/permute_scale/CMakeLists.txt
index fc0da56a96..427bf54ca1 100644
--- a/library/src/tensor_operation_instance/gpu/permute_scale/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/permute_scale/CMakeLists.txt
@@ -1,4 +1,4 @@
-add_instance_library(device_permute_scale_instance 
+add_instance_library(device_permute_scale_instance
     device_permute_scale_1d_fp16_instances.cpp
     device_permute_scale_2d_fp16_instances.cpp
     device_permute_scale_3d_fp16_instances.cpp
@@ -10,4 +10,5 @@ add_instance_library(device_permute_scale_instance
     device_permute_scale_3d_fp32_instances.cpp
     device_permute_scale_4d_fp32_instances.cpp
    device_permute_scale_5d_fp32_instances.cpp
-    device_permute_scale_6d_fp32_instances.cpp)
+    device_permute_scale_6d_fp32_instances.cpp
+    device_permute_scale_6d_fp32_fp8_instances.cpp)
diff --git a/library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_6d_fp32_fp8_instances.cpp b/library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_6d_fp32_fp8_instances.cpp
new file mode 100644
index 0000000000..95d83a5439
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/permute_scale/device_permute_scale_6d_fp32_fp8_instances.cpp
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/library/tensor_operation_instance/gpu/permute_scale/device_permute_scale_instances.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+using Scale = element_wise::Scale;
+
+void add_device_permute_scale_6d_f32_f8_instances(
+    std::vector, ck::Tuple, Scale, 6>>>&
+        instances)
+{
+#ifdef CK_ENABLE_FP8
+    add_device_operation_instances(instances, device_permute_scale_f32_f8_instances<6, Scale>{});
+#else
+    ignore = instances;
+#endif
+}
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
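Editor's sketch (not part of the patch): these instances implement the F32 to F8 cast-and-scale stage that follows the convolution, with element_wise::Scale carrying the combined scale factor. A minimal way to enumerate them, assuming the DeviceElementwise factory specialization from permute_scale.hpp shown earlier in this patch:

#include "ck/library/tensor_operation_instance/gpu/permute_scale.hpp"

// Each op applies y = f8_convert(scale * x) over a 6-D tensor.
auto get_f32_to_f8_scale_ops()
{
    using namespace ck::tensor_operation::device;
    using Scale = ck::tensor_operation::element_wise::Scale;

    return instance::DeviceOperationInstanceFactory<
        DeviceElementwise<ck::Tuple<float>, ck::Tuple<ck::f8_t>, Scale, 6>>::GetInstances();
}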
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp
index 17f45c3327..0c071e92f5 100644
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp
@@ -10,15 +10,24 @@ namespace device {
 namespace instance {
 
 // clang-format off
-// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
-template void add_device_reduce_instance_blockwise(std::vector>&);
-template void add_device_reduce_instance_blockwise(std::vector>&);
-template void add_device_reduce_instance_blockwise(std::vector>&);
-template void add_device_reduce_instance_blockwise(std::vector>&);
-template void add_device_reduce_instance_blockwise(std::vector>&);
-template void add_device_reduce_instance_blockwise(std::vector>&);
-template void add_device_reduce_instance_blockwise(std::vector>&);
-template void add_device_reduce_instance_blockwise(std::vector>&);
+// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
+template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector>&);
+template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector>&);
+template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector>&);
+template void add_device_reduce_instance_blockwise< F32, F32, F32, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, false>(std::vector>&);
+template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 3, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector>&);
+template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 4, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector>&);
+template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector>&);
+template void add_device_reduce_instance_blockwise< F32, F32, F32, 2, 1, ReduceAMax, UnaryAbs, PassThrough, false, true>(std::vector>&);
+template void add_device_reduce_instance_blockwise< F32, F32, F32, 6, 6, ReduceAMax, UnaryAbs, PassThrough, true, false>(std::vector>&);
+template void add_device_reduce_instance_blockwise< F32, F32, F32, 5, 5, ReduceAMax, UnaryAbs, PassThrough, true, false>(std::vector>&);
+template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 4, ReduceAMax, UnaryAbs, PassThrough, true, false>(std::vector>&);
+template void add_device_reduce_instance_blockwise< F32, F32, F32, 6, 3, ReduceAMax, UnaryAbs, PassThrough, true, false>(std::vector>&);
+template void add_device_reduce_instance_blockwise< F32, F32, F32, 5, 3, ReduceAMax, UnaryAbs, PassThrough, true, false>(std::vector>&);
+template void add_device_reduce_instance_blockwise< F32, F32, F32, 4, 3, ReduceAMax, UnaryAbs, PassThrough, true, false>(std::vector>&);
+template void add_device_reduce_instance_blockwise< F32, F32, F32, 3, 3, ReduceAMax, PassThrough, PassThrough, true, false>(std::vector>&);
+template void add_device_reduce_instance_blockwise< F32, F32, F32, 2, 2, ReduceAMax, PassThrough, PassThrough, true, false>(std::vector>&);
+template void add_device_reduce_instance_blockwise< F32, F32, F32, 1, 1, ReduceAMax, PassThrough, PassThrough, true, false>(std::vector>&);
 // clang-format on
 
 } // namespace instance
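Editor's sketch (not part of the patch): the new PropagateNan=true instantiations cover full and two-stage reductions of up to 6-D tensors; for example, the (Rank 6, NumReduceDim 6) pair reduces an entire N, D, H, W, G, K output to a single amax value in one pass, with the (6, 3)/(3, 3) pairs serving the multistage path. Requesting that instance through the reduce factory could look as follows, assuming ReduceAMax aliases ck::reduce::AMax as in CK's reduce instance headers:

#include "ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp"

auto get_full_6d_amax_ops()
{
    using namespace ck::tensor_operation::device;
    using ck::tensor_operation::element_wise::PassThrough;
    using ck::tensor_operation::element_wise::UnaryAbs;

    // Rank 6, all six dims reduced, NaN propagation on, no index output,
    // i.e. the (6, 6, true, false) instantiation above.
    return instance::DeviceOperationInstanceFactory<
        DeviceReduce<float, float, float, 6, 6, ck::reduce::AMax, UnaryAbs, PassThrough, true, false>>::
        GetInstances();
}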