From f504e98b5dbfd6deb1d1409646be4736f5a2a6a0 Mon Sep 17 00:00:00 2001 From: aledudek Date: Tue, 17 Sep 2024 15:57:10 +0200 Subject: [PATCH] Extend pool3d fwd avg, max operations by f8_t, int8_t types (#1483) * Extend pool3d fwd avg, max operations by f8_t, int8_t types * Pack MaxPool3dFwd params together * Fix MaxPool3dFwd AVG instances * Decrease verification precision for bf16 * Adjust tests + review changes * Adjust threshold for F8 * Adjusted compute types for MAX op instances * Fix ComputeDataType mismatch in tests and profiler for AVG * Fix naming from max_pool3d_fwd to pool3d_fwd * Adjust CMakeLists --------- Co-authored-by: Adam Osewski <19374865+aosewski@users.noreply.github.com> [ROCm/composable_kernel commit: a793afc961c9ab3a314ffdcacafe6455181d57e1] --- .../gpu/pool3d_fwd.hpp | 71 +++- .../gpu/pool3d_fwd/CMakeLists.txt | 4 + ...vice_avg_pool3d_fwd_ndhwc_f16_instance.cpp | 2 +- ...evice_avg_pool3d_fwd_ndhwc_f8_instance.cpp | 24 ++ ...evice_avg_pool3d_fwd_ndhwc_i8_instance.cpp | 24 ++ ...ice_max_pool3d_fwd_ndhwc_bf16_instance.cpp | 2 +- ...vice_max_pool3d_fwd_ndhwc_f16_instance.cpp | 2 +- ...evice_max_pool3d_fwd_ndhwc_f8_instance.cpp | 32 ++ ...evice_max_pool3d_fwd_ndhwc_i8_instance.cpp | 32 ++ .../pool3d_fwd/pool_fwd_instance_common.hpp | 4 +- .../profiler/profile_pool3d_fwd_impl.hpp | 121 ++++--- profiler/src/CMakeLists.txt | 2 +- profiler/src/profile_max_pool3d_fwd.cpp | 245 ------------- profiler/src/profile_pool3d_fwd.cpp | 331 ++++++++++++++++++ test/pool/test_avg_pool3d_fwd.cpp | 35 +- test/pool/test_max_pool3d_fwd.cpp | 46 +-- test/pool/test_pool_fwd_common.hpp | 2 + 17 files changed, 620 insertions(+), 359 deletions(-) create mode 100644 library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f8_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_i8_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f8_instance.cpp create mode 100644 library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_i8_instance.cpp delete mode 100644 profiler/src/profile_max_pool3d_fwd.cpp create mode 100644 profiler/src/profile_pool3d_fwd.cpp diff --git a/library/include/ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp b/library/include/ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp index 94ee68a409..e5e06f9e55 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/pool3d_fwd.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -22,7 +22,7 @@ static constexpr auto WindowRank = 3; static constexpr auto MaxOp = ck::ReduceTensorOp::MAX; static constexpr auto AvgOp = ck::ReduceTensorOp::AVG; -#ifdef CK_ENABLE_FP16 + // FP16 void add_device_pool3d_fwd_ndhwc_f16_instances( std::vector>>&); -#endif -#ifdef CK_ENABLE_BF16 + +using F8 = ck::f8_t; +// F8 +void add_device_pool3d_fwd_ndhwc_f8_instances( + std::vector>>&); + +void add_device_pool3d_fwd_ndhwc_f8_instances( + std::vector>>&); + +// FP8 - return index +void add_device_pool3d_fwd_ndhwc_index_f8_instances( + std::vector>>&); + // BF16 void add_device_pool3d_fwd_ndhwc_bf16_instances( std::vector>>&); -#endif -#ifdef CK_ENABLE_FP32 + // FP32 void add_device_pool3d_fwd_ndhwc_f32_instances( std::vector>>&); -#endif + +// I8 +void add_device_pool3d_fwd_ndhwc_i8_instances( + std::vector>>&); + +void add_device_pool3d_fwd_ndhwc_i8_instances( + std::vector>>&); + +// I8 - return index +void add_device_pool3d_fwd_ndhwc_index_i8_instances( + std::vector>>&); + template > op_ptrs; if constexpr(is_same_v && is_same_v) { -#ifdef CK_ENABLE_FP16 if constexpr(is_same_v && is_same_v && is_same_v) { @@ -112,8 +138,6 @@ struct DeviceOperationInstanceFactory && is_same_v && is_same_v) { @@ -126,8 +150,6 @@ struct DeviceOperationInstanceFactory && is_same_v && is_same_v) { @@ -140,7 +162,30 @@ struct DeviceOperationInstanceFactory && is_same_v && + is_same_v) + { + if constexpr(OutputIndex && ReduceOpId == MaxOp) + { + add_device_pool3d_fwd_ndhwc_index_f8_instances(op_ptrs); + } + else + { + add_device_pool3d_fwd_ndhwc_f8_instances(op_ptrs); + } + } + else if constexpr(is_same_v && is_same_v && + is_same_v) + { + if constexpr(OutputIndex && ReduceOpId == MaxOp) + { + add_device_pool3d_fwd_ndhwc_index_i8_instances(op_ptrs); + } + else + { + add_device_pool3d_fwd_ndhwc_i8_instances(op_ptrs); + } + } } return op_ptrs; diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/pool3d_fwd/CMakeLists.txt index 533ce89dfd..a40663bf75 100644 --- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/CMakeLists.txt @@ -1,6 +1,10 @@ set(DEVICE_POOL3D_FWD_INSTANCES) list(APPEND DEVICE_POOL3D_FWD_INSTANCES device_avg_pool3d_fwd_ndhwc_f16_instance.cpp device_max_pool3d_fwd_ndhwc_f16_instance.cpp + device_max_pool3d_fwd_ndhwc_f8_instance.cpp + device_avg_pool3d_fwd_ndhwc_f8_instance.cpp + device_max_pool3d_fwd_ndhwc_i8_instance.cpp + device_avg_pool3d_fwd_ndhwc_i8_instance.cpp device_avg_pool3d_fwd_ndhwc_f32_instance.cpp device_max_pool3d_fwd_ndhwc_f32_instance.cpp device_avg_pool3d_fwd_ndhwc_bf16_instance.cpp diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp index 4ebd50bae6..46ef817dd9 100644 --- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include "pool_fwd_instance_common.hpp" diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f8_instance.cpp b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f8_instance.cpp new file mode 100644 index 0000000000..6069b5a83a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_f8_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "pool_fwd_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG; + +void add_device_pool3d_fwd_ndhwc_f8_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, device_pool3d_fwd_ndhwc_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_i8_instance.cpp b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_i8_instance.cpp new file mode 100644 index 0000000000..fc286ffccd --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_avg_pool3d_fwd_ndhwc_i8_instance.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "pool_fwd_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG; + +void add_device_pool3d_fwd_ndhwc_i8_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, device_pool3d_fwd_ndhwc_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_bf16_instance.cpp b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_bf16_instance.cpp index 5dc504e178..d9da126fcf 100644 --- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_bf16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_bf16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include "pool_fwd_instance_common.hpp" diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp index 46b16bd005..9644fb992d 100644 --- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp +++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f16_instance.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include "pool_fwd_instance_common.hpp" diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f8_instance.cpp b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f8_instance.cpp new file mode 100644 index 0000000000..af31cf8a86 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_f8_instance.cpp @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "pool_fwd_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX; + +void add_device_pool3d_fwd_ndhwc_f8_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, device_pool3d_fwd_ndhwc_instances{}); +} + +void add_device_pool3d_fwd_ndhwc_index_f8_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, device_pool3d_fwd_ndhwc_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_i8_instance.cpp b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_i8_instance.cpp new file mode 100644 index 0000000000..78278ccc1f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/device_max_pool3d_fwd_ndhwc_i8_instance.cpp @@ -0,0 +1,32 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved. + +#include "pool_fwd_instance_common.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +static constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX; + +void add_device_pool3d_fwd_ndhwc_i8_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, device_pool3d_fwd_ndhwc_instances{}); +} + +void add_device_pool3d_fwd_ndhwc_index_i8_instances( + std::vector>>& + instances) +{ + add_device_operation_instances( + instances, device_pool3d_fwd_ndhwc_instances{}); +} + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/pool3d_fwd/pool_fwd_instance_common.hpp b/library/src/tensor_operation_instance/gpu/pool3d_fwd/pool_fwd_instance_common.hpp index e8e7813295..e477f1ed73 100644 --- a/library/src/tensor_operation_instance/gpu/pool3d_fwd/pool_fwd_instance_common.hpp +++ b/library/src/tensor_operation_instance/gpu/pool3d_fwd/pool_fwd_instance_common.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -15,6 +15,8 @@ namespace tensor_operation { namespace device { namespace instance { +using I8 = int8_t; +using F8 = ck::f8_t; using I32 = int32_t; using F16 = ck::half_t; using BF16 = ck::bhalf_t; diff --git a/profiler/include/profiler/profile_pool3d_fwd_impl.hpp b/profiler/include/profiler/profile_pool3d_fwd_impl.hpp index 02fde48d6e..3bdaa5c838 100644 --- a/profiler/include/profiler/profile_pool3d_fwd_impl.hpp +++ b/profiler/include/profiler/profile_pool3d_fwd_impl.hpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #pragma once @@ -17,6 +17,26 @@ namespace ck { namespace profiler { +struct PoolFwdInputParams +{ + int do_verification; + int init_method; + bool do_log; + bool time_kernel; + bool return_index; + int reduce_op; +}; + +struct PoolFwdKernelParams +{ + std::vector in_length; // NCDHW + std::vector window_spatial_lengths; + std::vector window_strides; + std::vector window_dilations; + std::vector input_left_pads; + std::vector input_right_pads; +}; + template -bool profile_pool3d_fwd_impl(int do_verification, - int init_method, - bool do_log, - bool time_kernel, - std::vector in_length, // NCDHW - std::vector window_spatial_lengths, - std::vector window_strides, - std::vector window_dilations, - std::vector input_left_pads, - std::vector input_right_pads) +bool profile_pool3d_fwd_impl(PoolFwdInputParams& in_params, PoolFwdKernelParams& kernel_params) { constexpr index_t InOutRank = 5; constexpr index_t WindowRank = 3; - if(in_length.size() != InOutRank || window_spatial_lengths.size() != WindowRank || - window_strides.size() != WindowRank || window_dilations.size() != WindowRank || - input_left_pads.size() != WindowRank || input_right_pads.size() != WindowRank) + if(kernel_params.in_length.size() != InOutRank || + kernel_params.window_spatial_lengths.size() != WindowRank || + kernel_params.window_strides.size() != WindowRank || + kernel_params.window_dilations.size() != WindowRank || + kernel_params.input_left_pads.size() != WindowRank || + kernel_params.input_right_pads.size() != WindowRank) return false; std::vector out_length(InOutRank); - int N = in_length[0]; - int C = in_length[1]; + int N = kernel_params.in_length[0]; + int C = kernel_params.in_length[1]; out_length[0] = N; out_length[1] = C; @@ -56,18 +70,18 @@ bool profile_pool3d_fwd_impl(int do_verification, // Calculate Do, Ho, Wo for(int i = 2; i < InOutRank; ++i) { - auto pad1 = input_left_pads[i - 2]; - auto pad2 = input_right_pads[i - 2]; - auto windows_size = window_spatial_lengths[i - 2]; - auto windows_stride = window_strides[i - 2]; - auto windows_dilation = window_dilations[i - 2]; + auto pad1 = kernel_params.input_left_pads[i - 2]; + auto pad2 = kernel_params.input_right_pads[i - 2]; + auto windows_size = kernel_params.window_spatial_lengths[i - 2]; + auto windows_stride = kernel_params.window_strides[i - 2]; + auto windows_dilation = kernel_params.window_dilations[i - 2]; auto eff = (windows_size - 1) * windows_dilation + 1; - out_length[i] = (in_length[i] + pad1 + pad2 - eff) / windows_stride + 1; + out_length[i] = (kernel_params.in_length[i] + pad1 + pad2 - eff) / windows_stride + 1; } - int Di = in_length[2]; - int Hi = in_length[3]; - int Wi = in_length[4]; + int Di = kernel_params.in_length[2]; + int Hi = kernel_params.in_length[3]; + int Wi = kernel_params.in_length[4]; int Do = out_length[2]; int Ho = out_length[3]; int Wo = out_length[4]; @@ -88,7 +102,7 @@ bool profile_pool3d_fwd_impl(int do_verification, Tensor out_indices_n_c_do_ho_wo_device( f_host_tensor_descriptor(N, C, Do, Ho, Wo)); - switch(init_method) + switch(in_params.init_method) { case 0: in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_1{}); break; case 1: in_n_c_di_hi_wi.GenerateTensorValue(GeneratorTensor_2{-5, 5}); break; @@ -125,7 +139,7 @@ bool profile_pool3d_fwd_impl(int do_verification, float best_avg_time = std::numeric_limits::max(); float best_gb_per_sec = 0; - if(do_verification) + if(in_params.do_verification) { using ReferenceInstance = ck::tensor_operation::host::ReferencePoolingFwd(in_device_buf.GetDeviceBuffer()), static_cast(out_device_buf.GetDeviceBuffer()), static_cast(out_indices_device_buf.GetDeviceBuffer()), - in_length, - window_spatial_lengths, + kernel_params.in_length, + kernel_params.window_spatial_lengths, out_length, {Di * C * Hi * Wi, 1, C * Hi * Wi, Wi * C, C}, {Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C}, {Do * C * Ho * Wo, 1, C * Ho * Wo, Wo * C, C}, - window_strides, - window_dilations, - input_left_pads, - input_right_pads, + kernel_params.window_strides, + kernel_params.window_dilations, + kernel_params.input_left_pads, + kernel_params.input_right_pads, {2, 3, 4}); if(inst_ptr->IsSupportedArgument(argument_ptr.get())) @@ -176,10 +190,11 @@ bool profile_pool3d_fwd_impl(int do_verification, } else { - if(time_kernel) + if(in_params.time_kernel) { std::cout << inst_ptr->GetTypeString() << " skipped due to unsupported argument: "; - LogRange(std::cout << "input lengths = ", in_length, ", ") << std::endl; + LogRange(std::cout << "input lengths = ", kernel_params.in_length, ", ") + << std::endl; } continue; @@ -187,7 +202,8 @@ bool profile_pool3d_fwd_impl(int do_verification, auto invoker_ptr = inst_ptr->MakeInvokerPointer(); - float avg_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, time_kernel}); + float avg_time = + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, in_params.time_kernel}); std::size_t num_bytes = in_n_c_di_hi_wi.mDesc.GetElementSize() * sizeof(InDataType) + out_n_c_do_ho_wo_host.mDesc.GetElementSize() * sizeof(OutDataType); @@ -198,7 +214,7 @@ bool profile_pool3d_fwd_impl(int do_verification, float gb_per_sec = num_bytes / 1.E6 / avg_time; - if(time_kernel) + if(in_params.time_kernel) std::cout << "Perf: " << std::setw(10) << avg_time << " ms, " << gb_per_sec << " GB/s, " << inst_ptr->GetTypeString() << std::endl; @@ -209,25 +225,25 @@ bool profile_pool3d_fwd_impl(int do_verification, best_gb_per_sec = gb_per_sec; } - if(do_verification) + if(in_params.do_verification) { out_device_buf.FromDevice(out_n_c_do_ho_wo_device.mData.data()); - bool pass = ck::utils::check_err(out_n_c_do_ho_wo_device.mData, + auto tolerance = 1e-3; + bool pass = ck::utils::check_err(out_n_c_do_ho_wo_device.mData, out_n_c_do_ho_wo_host.mData, "Error: Incorrect results", - 1e-3, - 1e-3); + tolerance, + tolerance); if constexpr(OutputIndex) { out_indices_device_buf.FromDevice(out_indices_n_c_do_ho_wo_device.mData.data()); - pass = pass && ck::utils::check_err(out_indices_n_c_do_ho_wo_device, out_indices_n_c_do_ho_wo_host); } - if(do_log) + if(in_params.do_log) { LogRangeAsType( std::cout << "in_n_c_di_hi_wi : ", in_n_c_di_hi_wi.mData, ",") @@ -249,20 +265,21 @@ bool profile_pool3d_fwd_impl(int do_verification, if(!pass) { std::cout << inst_ptr->GetTypeString() << " failed verification: "; - LogRange(std::cout << "lengths = [", in_length, ", ") << "]." << std::endl; + LogRange(std::cout << "lengths = [", kernel_params.in_length, ", ") + << "]." << std::endl; return false; } else { - if(time_kernel) + if(in_params.time_kernel) std::cout << "pass" << std::endl; } } } - if(time_kernel) + if(in_params.time_kernel) { - LogRange(std::cout << "length = ", in_length, ",") << std::endl; + LogRange(std::cout << "length = ", kernel_params.in_length, ",") << std::endl; std::cout << "best perf = " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s, " << best_instance_name << std::endl; } diff --git a/profiler/src/CMakeLists.txt b/profiler/src/CMakeLists.txt index 8d91c2b5e0..e9528baeb6 100644 --- a/profiler/src/CMakeLists.txt +++ b/profiler/src/CMakeLists.txt @@ -10,7 +10,7 @@ set(PROFILER_SOURCES profile_groupnorm_bwd_gamma_beta.cpp profile_layernorm_fwd.cpp profile_max_pool2d_fwd.cpp - profile_max_pool3d_fwd.cpp + profile_pool3d_fwd.cpp profile_avg_pool3d_bwd.cpp profile_max_pool3d_bwd.cpp profile_avg_pool2d_bwd.cpp diff --git a/profiler/src/profile_max_pool3d_fwd.cpp b/profiler/src/profile_max_pool3d_fwd.cpp deleted file mode 100644 index 52fdf29fe4..0000000000 --- a/profiler/src/profile_max_pool3d_fwd.cpp +++ /dev/null @@ -1,245 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. - -#include -#include -#include - -#include "profiler/data_type_enum.hpp" -#include "profiler/profile_pool3d_fwd_impl.hpp" -#include "profiler_operation_registry.hpp" - -using ck::index_t; - -struct maxPoolFwdArgParser -{ - std::unordered_map> long_opts = {{"length", {}}, - {"wsize", {}}, - {"wstride", {}}, - {"wdilation", {}}, - {"pad1", {}}, - {"pad2", {}}}; - - bool parse_opt(int argc, char* argv[], const std::string& key, int i) - { - if(std::string("--") + key == argv[i]) - { - int pos = i; - while(++i < argc && argv[i][0] != '-') {} - int end = i; - for(int j = pos + 1; j < end; j++) - { - long_opts[key].push_back(std::stoi(argv[j])); - } - return true; - } - return false; - } - - void operator()(int argc, char* argv[]) - { - for(auto& kv : long_opts) - { - for(int i = 1; i < argc; i++) - { - if(parse_opt(argc, argv, kv.first, i)) - break; - } - } - } -}; - -void print_help_max_pool3d_fwd() -{ - std::cout << "arg1: data type (0: fp16; 1: fp32; 5: bf16)\n" - << "arg2: verification (0: no; 1: yes)\n" - << "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n" - << "arg4: print tensor value (0: no; 1: yes)\n" - << "arg5: time kernel (0=no, 1=yes)\n" - << "arg6: return index (0=no, 1=yes)\n" - << "--length: input tensor length for NCDHW(e.g, --length 2 32 30 30 30) \n" - << "--wsize: window size for ZYX (e.g, --wsize 2 2 2) \n" - << "--wstride: window stride for DHW (e.g, --wstride 2 2 2) \n" - << "--wdilation: window dilation for DHW (e.g, --wdilation 1 1 1) \n" - << "--pad1: left side of padding in DHW (e.g, --pad1 1 1 1) \n" - << "--pad2: right side of padding in DHW (e.g, --pad2 1 1 1) \n" - << "eg: ckProfiler max_pool3d_fwd 0 1 2 0 1 0 --length 2 32 30 30 30 --wsize 2 2 2 " - "--wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1" - << std::endl; -} - -int profile_max_pool3d_fwd(int argc, char* argv[]) -{ - ck::DataTypeEnum data_type = ck::DataTypeEnum::Half; - bool do_verification = true; - int init_method = 0; - bool do_log = false; - bool time_kernel = true; - bool return_index = false; - - std::vector in_length = {2, 32, 30, 30, 30}; - std::vector wsize = {2, 2, 2}; - std::vector wstride = {2, 2, 2}; - std::vector wdilation = {1, 1, 1}; - std::vector pad1 = {1, 1, 1}; - std::vector pad2 = {1, 1, 1}; - - if(argc != 2 && argc != 34) - { - print_help_max_pool3d_fwd(); - return 0; - } - else if(argc == 34) - { - data_type = static_cast(std::stoi(argv[2])); - do_verification = std::stoi(argv[3]); - init_method = std::stoi(argv[4]); - do_log = std::stoi(argv[5]); - time_kernel = std::stoi(argv[6]); - return_index = std::stoi(argv[7]); - - // parse the long options - maxPoolFwdArgParser arg_parser; - arg_parser(argc, argv); - in_length = arg_parser.long_opts["length"]; - wsize = arg_parser.long_opts["wsize"]; - wstride = arg_parser.long_opts["wstride"]; - wdilation = arg_parser.long_opts["wdilation"]; - pad1 = arg_parser.long_opts["pad1"]; - pad2 = arg_parser.long_opts["pad2"]; - } - -#ifdef CK_ENABLE_FP16 - using F16 = ck::half_t; -#endif -#ifdef CK_ENABLE_BF16 - using BF16 = ck::bhalf_t; -#endif -#ifdef CK_ENABLE_FP32 - using F32 = float; -#endif - using I32 = int32_t; - using NDHWC = ck::tensor_layout::convolution::NDHWC; - -#if 1 - constexpr auto ReduceOpId = ck::ReduceTensorOp::MAX; -#else - constexpr auto ReduceOpId = ck::ReduceTensorOp::AVG; -#endif - - if(false) - ; -#ifdef CK_ENABLE_FP16 - else if(data_type == ck::DataTypeEnum::Half) - { - if(return_index) - ck::profiler:: - profile_pool3d_fwd_impl( - do_verification, - init_method, - do_log, - time_kernel, - in_length, - wsize, - wstride, - wdilation, - pad1, - pad2); - else - ck::profiler:: - profile_pool3d_fwd_impl( - do_verification, - init_method, - do_log, - time_kernel, - in_length, - wsize, - wstride, - wdilation, - pad1, - pad2); - } -#endif -#ifdef CK_ENABLE_BF16 - else if(data_type == ck::DataTypeEnum::BFloat16) - { - if(return_index) - ck::profiler::profile_pool3d_fwd_impl(do_verification, - init_method, - do_log, - time_kernel, - in_length, - wsize, - wstride, - wdilation, - pad1, - pad2); - else - ck::profiler::profile_pool3d_fwd_impl(do_verification, - init_method, - do_log, - time_kernel, - in_length, - wsize, - wstride, - wdilation, - pad1, - pad2); - } -#endif -#ifdef CK_ENABLE_FP32 - else if(data_type == ck::DataTypeEnum::Float) - { - if(return_index) - ck::profiler:: - profile_pool3d_fwd_impl( - do_verification, - init_method, - do_log, - time_kernel, - in_length, - wsize, - wstride, - wdilation, - pad1, - pad2); - else - ck::profiler:: - profile_pool3d_fwd_impl( - do_verification, - init_method, - do_log, - time_kernel, - in_length, - wsize, - wstride, - wdilation, - pad1, - pad2); - } -#endif - else - { - throw std::runtime_error("not implemented yet"); - } - - return 0; -} - -REGISTER_PROFILER_OPERATION("max_pool3d_fwd", "max_pool3d fwd", profile_max_pool3d_fwd); diff --git a/profiler/src/profile_pool3d_fwd.cpp b/profiler/src/profile_pool3d_fwd.cpp new file mode 100644 index 0000000000..4ea1fbcf49 --- /dev/null +++ b/profiler/src/profile_pool3d_fwd.cpp @@ -0,0 +1,331 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include + +#include "profiler/data_type_enum.hpp" +#include "profiler/profile_pool3d_fwd_impl.hpp" +#include "profiler_operation_registry.hpp" + +using ck::index_t; + +struct poolFwdArgParser +{ + std::unordered_map> long_opts = {{"length", {}}, + {"wsize", {}}, + {"wstride", {}}, + {"wdilation", {}}, + {"pad1", {}}, + {"pad2", {}}}; + + bool parse_opt(int argc, char* argv[], const std::string& key, int i) + { + if(std::string("--") + key == argv[i]) + { + int pos = i; + while(++i < argc && argv[i][0] != '-') {} + int end = i; + for(int j = pos + 1; j < end; j++) + { + long_opts[key].push_back(std::stoi(argv[j])); + } + return true; + } + return false; + } + + void operator()(int argc, char* argv[]) + { + for(auto& kv : long_opts) + { + for(int i = 1; i < argc; i++) + { + if(parse_opt(argc, argv, kv.first, i)) + break; + } + } + } +}; + +void print_help_pool3d_fwd() +{ + std::cout << "arg1: data type (0: fp16; 1: fp32; 3: int8; 5: bf16; 7: fp8)\n" + << "arg2: verification (0: no; 1: yes)\n" + << "arg3: initialization (0: no init; 1: integer value; 2: decimal value)\n" + << "arg4: print tensor value (0: no; 1: yes)\n" + << "arg5: time kernel (0=no, 1=yes)\n" + << "arg6: return index (0=no, 1=yes)\n" + << "arg7: reduce op (0: max; 1: avg)\n" + << "--length: input tensor length for NCDHW(e.g, --length 2 32 30 30 30) \n" + << "--wsize: window size for ZYX (e.g, --wsize 2 2 2) \n" + << "--wstride: window stride for DHW (e.g, --wstride 2 2 2) \n" + << "--wdilation: window dilation for DHW (e.g, --wdilation 1 1 1) \n" + << "--pad1: left side of padding in DHW (e.g, --pad1 1 1 1) \n" + << "--pad2: right side of padding in DHW (e.g, --pad2 1 1 1) \n" + << "eg: ckProfiler pool3d_fwd 0 1 2 0 1 0 --length 2 32 30 30 30 --wsize 2 2 2 " + "--wstride 2 2 2 --wdilation 1 1 1 --pad1 1 1 1 --pad2 1 1 1" + << std::endl; +} + +int profile_pool3d_fwd(int argc, char* argv[]) +{ + ck::DataTypeEnum data_type = ck::DataTypeEnum::Half; + ck::profiler::PoolFwdInputParams in_params{true, 0, false, true, false, 0}; + ck::profiler::PoolFwdKernelParams kernel_params{ + {2, 32, 30, 30, 30}, {2, 2, 2}, {2, 2, 2}, {1, 1, 1}, {1, 1, 1}, {1, 1, 1}}; + + if(argc != 2 && argc != 35) + { + print_help_pool3d_fwd(); + return 0; + } + else if(argc == 35) + { + data_type = static_cast(std::stoi(argv[2])); + in_params.do_verification = std::stoi(argv[3]); + in_params.init_method = std::stoi(argv[4]); + in_params.do_log = std::stoi(argv[5]); + in_params.time_kernel = std::stoi(argv[6]); + in_params.return_index = std::stoi(argv[7]); + in_params.reduce_op = std::stoi(argv[8]); + + // parse the long options + poolFwdArgParser arg_parser; + arg_parser(argc, argv); + kernel_params.in_length = arg_parser.long_opts["length"]; + kernel_params.window_spatial_lengths = arg_parser.long_opts["wsize"]; + kernel_params.window_strides = arg_parser.long_opts["wstride"]; + kernel_params.window_dilations = arg_parser.long_opts["wdilation"]; + kernel_params.input_left_pads = arg_parser.long_opts["pad1"]; + kernel_params.input_right_pads = arg_parser.long_opts["pad2"]; + } + + using F16 = ck::half_t; + using BF16 = ck::bhalf_t; + using F32 = float; + using I8 = int8_t; + using I32 = int32_t; + using F8 = ck::f8_t; + using NDHWC = ck::tensor_layout::convolution::NDHWC; + + if(data_type == ck::DataTypeEnum::Half) + { + if(in_params.reduce_op == 1) + { + ck::profiler::profile_pool3d_fwd_impl(in_params, kernel_params); + } + else + { // reduce_op == 0 + if(in_params.return_index) + { + ck::profiler::profile_pool3d_fwd_impl(in_params, kernel_params); + } + else + { + ck::profiler::profile_pool3d_fwd_impl(in_params, kernel_params); + } + } + } + else if(data_type == ck::DataTypeEnum::BFloat16) + { + if(in_params.reduce_op == 1) + { + ck::profiler::profile_pool3d_fwd_impl(in_params, kernel_params); + } + else + { // reduce_op == 0 + if(in_params.return_index) + { + ck::profiler::profile_pool3d_fwd_impl(in_params, kernel_params); + } + else + { + ck::profiler::profile_pool3d_fwd_impl(in_params, kernel_params); + } + } + } + else if(data_type == ck::DataTypeEnum::Float) + { + if(in_params.reduce_op == 1) + { + ck::profiler::profile_pool3d_fwd_impl(in_params, kernel_params); + } + else + { // reduce_op == 0 + if(in_params.return_index) + { + ck::profiler::profile_pool3d_fwd_impl(in_params, kernel_params); + } + else + { + ck::profiler::profile_pool3d_fwd_impl(in_params, kernel_params); + } + } + } + else if(data_type == ck::DataTypeEnum::Float8) + { + if(in_params.reduce_op == 1) + { + return ck::profiler::profile_pool3d_fwd_impl(in_params, kernel_params); + } + else + { // reduce_op == 0 + if(in_params.return_index) + { + return ck::profiler::profile_pool3d_fwd_impl(in_params, kernel_params); + } + else + { + return ck::profiler::profile_pool3d_fwd_impl(in_params, kernel_params); + } + } + } + else if(data_type == ck::DataTypeEnum::Int8) + { + if(in_params.reduce_op == 1) + { + return ck::profiler::profile_pool3d_fwd_impl(in_params, kernel_params); + } + else + { // reduce_op == 0 + if(in_params.return_index) + { + return ck::profiler::profile_pool3d_fwd_impl(in_params, kernel_params); + } + else + { + return ck::profiler::profile_pool3d_fwd_impl(in_params, kernel_params); + } + } + } + else + { + throw std::runtime_error("not implemented yet"); + } + + return 0; +} + +REGISTER_PROFILER_OPERATION("pool3d_fwd", "pool3d fwd", profile_pool3d_fwd); diff --git a/test/pool/test_avg_pool3d_fwd.cpp b/test/pool/test_avg_pool3d_fwd.cpp index fc196a8a07..378b05399e 100644 --- a/test/pool/test_avg_pool3d_fwd.cpp +++ b/test/pool/test_avg_pool3d_fwd.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include "gtest/gtest.h" #include "profiler/profile_pool3d_fwd_impl.hpp" @@ -16,10 +16,19 @@ class TestAvgPool3dFwd : public ::testing::Test std::vector params; + ck::profiler::PoolFwdInputParams in_params_avg_pool{true, 2, false, false, false, 1}; + void Run() { for(auto param : params) { + ck::profiler::PoolFwdKernelParams kernel_params{param.length_, + param.window_spatial_lengths_, + param.window_strides_, + param.window_dilations_, + param.input_left_pads_, + param.input_right_pads_}; + bool success = ck::profiler::profile_pool3d_fwd_impl(true, - 2, - false, - false, - param.length_, - param.window_spatial_lengths_, - param.window_strides_, - param.window_dilations_, - param.input_left_pads_, - param.input_right_pads_); + false>(in_params_avg_pool, kernel_params); EXPECT_TRUE(success); } } }; -#ifdef CK_ENABLE_FP16 -using KernelTypes = - ::testing::Types, std::tuple>; -#else -using KernelTypes = ::testing::Types>; -#endif + +using KernelTypes = ::testing::Types, + std::tuple, + std::tuple, + std::tuple, + std::tuple>; + TYPED_TEST_SUITE(TestAvgPool3dFwd, KernelTypes); TYPED_TEST(TestAvgPool3dFwd, Test_Pool) { diff --git a/test/pool/test_max_pool3d_fwd.cpp b/test/pool/test_max_pool3d_fwd.cpp index 7189f1b104..d7602f9acd 100644 --- a/test/pool/test_max_pool3d_fwd.cpp +++ b/test/pool/test_max_pool3d_fwd.cpp @@ -1,5 +1,5 @@ // SPDX-License-Identifier: MIT -// Copyright (c) 2018-2023, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved. #include "gtest/gtest.h" #include "profiler/profile_pool3d_fwd_impl.hpp" @@ -16,10 +16,20 @@ class TestMaxPool3dFwd : public ::testing::Test std::vector params; + ck::profiler::PoolFwdInputParams in_params_max_pool{true, 2, false, false, false, 0}; + ck::profiler::PoolFwdInputParams in_params_max_pool_indexed{true, 2, false, false, true, 0}; + void Run() { for(auto param : params) { + ck::profiler::PoolFwdKernelParams kernel_params{param.length_, + param.window_spatial_lengths_, + param.window_strides_, + param.window_dilations_, + param.input_left_pads_, + param.input_right_pads_}; + // max pool bool success = ck::profiler::profile_pool3d_fwd_impl(true, - 2, - false, - false, - param.length_, - param.window_spatial_lengths_, - param.window_strides_, - param.window_dilations_, - param.input_left_pads_, - param.input_right_pads_); + false>(in_params_max_pool, kernel_params); EXPECT_TRUE(success); // max pool + index @@ -51,27 +52,18 @@ class TestMaxPool3dFwd : public ::testing::Test ck::tensor_layout::convolution::NDHWC, ck::ReduceTensorOp::MAX, false, - true>(true, - 2, - false, - false, - param.length_, - param.window_spatial_lengths_, - param.window_strides_, - param.window_dilations_, - param.input_left_pads_, - param.input_right_pads_); + true>(in_params_max_pool_indexed, + kernel_params); EXPECT_TRUE(success); } } }; -#ifdef CK_ENABLE_FP16 -using KernelTypes = - ::testing::Types, std::tuple>; -#else -using KernelTypes = ::testing::Types>; -#endif +using KernelTypes = ::testing::Types, + std::tuple, + std::tuple, + std::tuple, + std::tuple>; TYPED_TEST_SUITE(TestMaxPool3dFwd, KernelTypes); TYPED_TEST(TestMaxPool3dFwd, Test_Pool) diff --git a/test/pool/test_pool_fwd_common.hpp b/test/pool/test_pool_fwd_common.hpp index 0302c3c337..b510b2f214 100644 --- a/test/pool/test_pool_fwd_common.hpp +++ b/test/pool/test_pool_fwd_common.hpp @@ -4,6 +4,8 @@ #include "gtest/gtest.h" #include "ck/ck.hpp" +using I8 = int8_t; +using F8 = ck::f8_t; using F16 = ck::half_t; using BF16 = ck::bhalf_t; using F32 = float;