From 2ca8512f487d54a1a4cffa55a40efbc5fbbd31d3 Mon Sep 17 00:00:00 2001 From: Qianfeng Date: Tue, 17 Jan 2023 12:18:06 +0800 Subject: [PATCH] Reduction external API and client examples (#493) * Change to the DeviceReduce base class template to include all problem description information * Add external api for reduction * Add client example to test the reduction external api * Spelling correction * Re-implement the host_reduction to follow the DeviceReduce base API format * Change the reduce profiler to call the external API for collecting device instances * Rename reduce client example directory from 08_reduce to 12_reduce * Remove (void) before the functional call * Tiny update in reduce client example * Tiny update in profile_reduce_impl.hpp * Rename the reduce client example directory Co-authored-by: Po Yen Chen [ROCm/composable_kernel commit: 80e05267417f948e4f7e63c0fe807106d9a0c0ef] --- client_example/15_reduce/CMakeLists.txt | 2 + client_example/15_reduce/reduce_nhwc_c.cpp | 175 +++++++ example/12_reduce/reduce_blockwise_impl.hpp | 72 +-- .../12_reduce/reduce_blockwise_two_call.cpp | 76 +-- .../reduce_multiblock_atomic_add_impl.hpp | 72 +-- .../gpu/device/device_reduce.hpp | 32 +- .../device_multiple_reduce_multiblock.hpp | 4 +- .../device/impl/device_reduce_multiblock.hpp | 16 +- .../device/impl/device_reduce_threadwise.hpp | 13 +- include/ck/utility/reduction_operator.hpp | 10 +- .../cpu/reference_reduce.hpp | 435 ++++++++++++++++++ .../device_reduce_instance_blockwise.hpp | 12 +- ...uce_instance_blockwise_b16_f32_b16_add.hpp | 8 +- ...ce_instance_blockwise_b16_f32_b16_amax.hpp | 16 +- ...uce_instance_blockwise_b16_f32_b16_avg.hpp | 8 +- ...uce_instance_blockwise_b16_f32_b16_max.hpp | 16 +- ...uce_instance_blockwise_b16_f32_b16_min.hpp | 16 +- ...e_instance_blockwise_b16_f32_b16_norm2.hpp | 8 +- ...ce_instance_blockwise_f16_f16_f16_amax.hpp | 16 +- ...uce_instance_blockwise_f16_f16_f16_max.hpp | 16 +- ...uce_instance_blockwise_f16_f16_f16_min.hpp | 16 +- ...uce_instance_blockwise_f16_f32_f16_add.hpp | 8 +- ...uce_instance_blockwise_f16_f32_f16_avg.hpp | 8 +- ...e_instance_blockwise_f16_f32_f16_norm2.hpp | 8 +- ...uce_instance_blockwise_f32_f32_f32_add.hpp | 8 +- ...ce_instance_blockwise_f32_f32_f32_amax.hpp | 16 +- ...uce_instance_blockwise_f32_f32_f32_avg.hpp | 8 +- ...uce_instance_blockwise_f32_f32_f32_max.hpp | 16 +- ...uce_instance_blockwise_f32_f32_f32_min.hpp | 16 +- ...e_instance_blockwise_f32_f32_f32_norm2.hpp | 8 +- ...uce_instance_blockwise_f32_f64_f32_add.hpp | 8 +- ...uce_instance_blockwise_f32_f64_f32_avg.hpp | 8 +- ...e_instance_blockwise_f32_f64_f32_norm2.hpp | 8 +- ...uce_instance_blockwise_f64_f64_f64_add.hpp | 8 +- ...ce_instance_blockwise_f64_f64_f64_amax.hpp | 16 +- ...uce_instance_blockwise_f64_f64_f64_avg.hpp | 8 +- ...uce_instance_blockwise_f64_f64_f64_max.hpp | 16 +- ...uce_instance_blockwise_f64_f64_f64_min.hpp | 16 +- ...e_instance_blockwise_f64_f64_f64_norm2.hpp | 8 +- ...educe_instance_blockwise_i8_i32_i8_add.hpp | 8 +- ...educe_instance_blockwise_i8_i32_i8_avg.hpp | 8 +- ...educe_instance_blockwise_i8_i8_i8_amax.hpp | 16 +- ...reduce_instance_blockwise_i8_i8_i8_max.hpp | 16 +- ...reduce_instance_blockwise_i8_i8_i8_min.hpp | 16 +- ..._reduce_instance_multiblock_atomic_add.hpp | 12 +- ..._multiblock_atomic_add_b16_f32_f32_add.hpp | 8 +- ..._multiblock_atomic_add_b16_f32_f32_avg.hpp | 8 +- ..._multiblock_atomic_add_f16_f32_f32_add.hpp | 8 +- ..._multiblock_atomic_add_f16_f32_f32_avg.hpp | 8 +- ..._multiblock_atomic_add_f32_f32_f32_add.hpp | 8 +- ..._multiblock_atomic_add_f32_f32_f32_avg.hpp | 8 +- ..._multiblock_atomic_add_f32_f64_f32_add.hpp | 8 +- ..._multiblock_atomic_add_f32_f64_f32_avg.hpp | 8 +- ..._multiblock_atomic_add_f64_f64_f64_add.hpp | 8 +- ..._multiblock_atomic_add_f64_f64_f64_avg.hpp | 8 +- .../device_reduce_instance_threadwise.hpp | 12 +- ...ce_instance_threadwise_b16_f32_b16_add.hpp | 8 +- ...e_instance_threadwise_b16_f32_b16_amax.hpp | 16 +- ...ce_instance_threadwise_b16_f32_b16_avg.hpp | 8 +- ...ce_instance_threadwise_b16_f32_b16_max.hpp | 16 +- ...ce_instance_threadwise_b16_f32_b16_min.hpp | 16 +- ..._instance_threadwise_b16_f32_b16_norm2.hpp | 8 +- ...e_instance_threadwise_f16_f16_f16_amax.hpp | 16 +- ...ce_instance_threadwise_f16_f16_f16_max.hpp | 16 +- ...ce_instance_threadwise_f16_f16_f16_min.hpp | 16 +- ...ce_instance_threadwise_f16_f32_f16_add.hpp | 8 +- ...ce_instance_threadwise_f16_f32_f16_avg.hpp | 8 +- ..._instance_threadwise_f16_f32_f16_norm2.hpp | 8 +- ...ce_instance_threadwise_f32_f32_f32_add.hpp | 8 +- ...e_instance_threadwise_f32_f32_f32_amax.hpp | 16 +- ...ce_instance_threadwise_f32_f32_f32_avg.hpp | 8 +- ...ce_instance_threadwise_f32_f32_f32_max.hpp | 16 +- ...ce_instance_threadwise_f32_f32_f32_min.hpp | 16 +- ..._instance_threadwise_f32_f32_f32_norm2.hpp | 8 +- ...ce_instance_threadwise_f32_f64_f32_add.hpp | 8 +- ...ce_instance_threadwise_f32_f64_f32_avg.hpp | 8 +- ..._instance_threadwise_f32_f64_f32_norm2.hpp | 8 +- ...ce_instance_threadwise_f64_f64_f64_add.hpp | 8 +- ...e_instance_threadwise_f64_f64_f64_amax.hpp | 16 +- ...ce_instance_threadwise_f64_f64_f64_avg.hpp | 8 +- ...ce_instance_threadwise_f64_f64_f64_max.hpp | 16 +- ...ce_instance_threadwise_f64_f64_f64_min.hpp | 16 +- ..._instance_threadwise_f64_f64_f64_norm2.hpp | 8 +- ...duce_instance_threadwise_i8_i32_i8_add.hpp | 8 +- ...duce_instance_threadwise_i8_i32_i8_avg.hpp | 8 +- ...duce_instance_threadwise_i8_i8_i8_amax.hpp | 16 +- ...educe_instance_threadwise_i8_i8_i8_max.hpp | 16 +- ...educe_instance_threadwise_i8_i8_i8_min.hpp | 16 +- .../gpu/reduce/reduce.hpp | 117 +++++ .../ck/library/utility/host_reduction.hpp | 374 --------------- ...uce_instance_blockwise_b16_f32_b16_add.cpp | 8 +- ...ce_instance_blockwise_b16_f32_b16_amax.cpp | 16 +- ...uce_instance_blockwise_b16_f32_b16_avg.cpp | 8 +- ...uce_instance_blockwise_b16_f32_b16_max.cpp | 16 +- ...uce_instance_blockwise_b16_f32_b16_min.cpp | 16 +- ...e_instance_blockwise_b16_f32_b16_norm2.cpp | 8 +- ...ce_instance_blockwise_f16_f16_f16_amax.cpp | 16 +- ...uce_instance_blockwise_f16_f16_f16_max.cpp | 16 +- ...uce_instance_blockwise_f16_f16_f16_min.cpp | 16 +- ...uce_instance_blockwise_f16_f32_f16_add.cpp | 8 +- ...uce_instance_blockwise_f16_f32_f16_avg.cpp | 8 +- ...e_instance_blockwise_f16_f32_f16_norm2.cpp | 8 +- ...uce_instance_blockwise_f32_f32_f32_add.cpp | 8 +- ...ce_instance_blockwise_f32_f32_f32_amax.cpp | 16 +- ...uce_instance_blockwise_f32_f32_f32_avg.cpp | 8 +- ...uce_instance_blockwise_f32_f32_f32_max.cpp | 16 +- ...uce_instance_blockwise_f32_f32_f32_min.cpp | 16 +- ...e_instance_blockwise_f32_f32_f32_norm2.cpp | 8 +- ...uce_instance_blockwise_f32_f64_f32_add.cpp | 8 +- ...uce_instance_blockwise_f32_f64_f32_avg.cpp | 8 +- ...e_instance_blockwise_f32_f64_f32_norm2.cpp | 8 +- ...uce_instance_blockwise_f64_f64_f64_add.cpp | 8 +- ...ce_instance_blockwise_f64_f64_f64_amax.cpp | 16 +- ...uce_instance_blockwise_f64_f64_f64_avg.cpp | 8 +- ...uce_instance_blockwise_f64_f64_f64_max.cpp | 16 +- ...uce_instance_blockwise_f64_f64_f64_min.cpp | 16 +- ...e_instance_blockwise_f64_f64_f64_norm2.cpp | 8 +- ...educe_instance_blockwise_i8_i32_i8_add.cpp | 8 +- ...educe_instance_blockwise_i8_i32_i8_avg.cpp | 8 +- ...educe_instance_blockwise_i8_i8_i8_amax.cpp | 16 +- ...reduce_instance_blockwise_i8_i8_i8_max.cpp | 16 +- ...reduce_instance_blockwise_i8_i8_i8_min.cpp | 16 +- ..._multiblock_atomic_add_b16_f32_f32_add.cpp | 8 +- ..._multiblock_atomic_add_b16_f32_f32_avg.cpp | 8 +- ..._multiblock_atomic_add_f16_f32_f32_add.cpp | 8 +- ..._multiblock_atomic_add_f16_f32_f32_avg.cpp | 8 +- ..._multiblock_atomic_add_f32_f32_f32_add.cpp | 8 +- ..._multiblock_atomic_add_f32_f32_f32_avg.cpp | 8 +- ..._multiblock_atomic_add_f32_f64_f32_add.cpp | 8 +- ..._multiblock_atomic_add_f32_f64_f32_avg.cpp | 8 +- ..._multiblock_atomic_add_f64_f64_f64_add.cpp | 8 +- ..._multiblock_atomic_add_f64_f64_f64_avg.cpp | 8 +- ...ce_instance_threadwise_b16_f32_b16_add.cpp | 8 +- ...e_instance_threadwise_b16_f32_b16_amax.cpp | 16 +- ...ce_instance_threadwise_b16_f32_b16_avg.cpp | 8 +- ...ce_instance_threadwise_b16_f32_b16_max.cpp | 16 +- ...ce_instance_threadwise_b16_f32_b16_min.cpp | 16 +- ..._instance_threadwise_b16_f32_b16_norm2.cpp | 8 +- ...e_instance_threadwise_f16_f16_f16_amax.cpp | 16 +- ...ce_instance_threadwise_f16_f16_f16_max.cpp | 16 +- ...ce_instance_threadwise_f16_f16_f16_min.cpp | 16 +- ...ce_instance_threadwise_f16_f32_f16_add.cpp | 8 +- ...ce_instance_threadwise_f16_f32_f16_avg.cpp | 8 +- ..._instance_threadwise_f16_f32_f16_norm2.cpp | 8 +- ...ce_instance_threadwise_f32_f32_f32_add.cpp | 8 +- ...e_instance_threadwise_f32_f32_f32_amax.cpp | 16 +- ...ce_instance_threadwise_f32_f32_f32_avg.cpp | 8 +- ...ce_instance_threadwise_f32_f32_f32_max.cpp | 16 +- ...ce_instance_threadwise_f32_f32_f32_min.cpp | 16 +- ..._instance_threadwise_f32_f32_f32_norm2.cpp | 8 +- ...ce_instance_threadwise_f32_f64_f32_add.cpp | 8 +- ...ce_instance_threadwise_f32_f64_f32_avg.cpp | 8 +- ..._instance_threadwise_f32_f64_f32_norm2.cpp | 8 +- ...ce_instance_threadwise_f64_f64_f64_add.cpp | 8 +- ...e_instance_threadwise_f64_f64_f64_amax.cpp | 16 +- ...ce_instance_threadwise_f64_f64_f64_avg.cpp | 8 +- ...ce_instance_threadwise_f64_f64_f64_max.cpp | 16 +- ...ce_instance_threadwise_f64_f64_f64_min.cpp | 16 +- ..._instance_threadwise_f64_f64_f64_norm2.cpp | 8 +- ...duce_instance_threadwise_i8_i32_i8_add.cpp | 8 +- ...duce_instance_threadwise_i8_i32_i8_avg.cpp | 8 +- ...duce_instance_threadwise_i8_i8_i8_amax.cpp | 16 +- ...educe_instance_threadwise_i8_i8_i8_max.cpp | 16 +- ...educe_instance_threadwise_i8_i8_i8_min.cpp | 16 +- .../include/profiler/profile_reduce_impl.hpp | 145 +++--- 165 files changed, 1855 insertions(+), 1388 deletions(-) create mode 100644 client_example/15_reduce/CMakeLists.txt create mode 100644 client_example/15_reduce/reduce_nhwc_c.cpp create mode 100644 library/include/ck/library/reference_tensor_operation/cpu/reference_reduce.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp delete mode 100644 library/include/ck/library/utility/host_reduction.hpp diff --git a/client_example/15_reduce/CMakeLists.txt b/client_example/15_reduce/CMakeLists.txt new file mode 100644 index 0000000000..d52675ba83 --- /dev/null +++ b/client_example/15_reduce/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(client_reduce_nhwc_c reduce_nhwc_c.cpp) +target_link_libraries(client_reduce_nhwc_c PRIVATE composable_kernel::device_operations) diff --git a/client_example/15_reduce/reduce_nhwc_c.cpp b/client_example/15_reduce/reduce_nhwc_c.cpp new file mode 100644 index 0000000000..8f4902ae25 --- /dev/null +++ b/client_example/15_reduce/reduce_nhwc_c.cpp @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/tensor_operation/gpu/device/device_reduce.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp" + +using InDataType = float; +using OutDataType = float; +using AccDataType = float; +using ReduceAdd = ck::reduce::Add; +using PassThrough = ck::tensor_operation::element_wise::PassThrough; +using UnaryDivide = ck::tensor_operation::element_wise::UnaryDivide; + +constexpr bool PropagateNan = false; +constexpr bool OutputIndex = false; + +constexpr int Rank = 4; +constexpr int NumReduceDim = 3; + +struct SimpleDeviceMem +{ + SimpleDeviceMem() = delete; + + SimpleDeviceMem(std::size_t mem_size) : p_mem_{} + { + (void)hipMalloc(static_cast(&p_mem_), mem_size); + } + + void* GetDeviceBuffer() { return p_mem_; } + + ~SimpleDeviceMem() { (void)hipFree(p_mem_); } + + void* p_mem_; +}; + +int main(int argc, char* argv[]) +{ + std::array in_lengths{16, 8, 128, 256}; + std::array in_strides{8 * 128 * 256, 128 * 256, 256, 1}; + std::array out_lengths{256}; + std::array out_strides{1}; + std::array reduce_dims{0, 1, 2}; + + ck::index_t num_in_elements = + std::accumulate(in_lengths.begin(), in_lengths.end(), 1, std::multiplies()); + + ck::index_t num_out_elements = + std::accumulate(out_lengths.begin(), out_lengths.end(), 1, std::multiplies()); + + ck::index_t reduce_length = 1; + + for(auto dim : reduce_dims) + reduce_length *= in_lengths[dim]; + + float alpha{1.0f}; + float beta{0.0f}; + + SimpleDeviceMem in(sizeof(InDataType) * num_in_elements); + SimpleDeviceMem out(sizeof(OutDataType) * num_out_elements); + + using DeviceOp = ck::tensor_operation::device::DeviceReduce; + const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + DeviceOp>::GetInstances(); + + std::cout << "found " << op_ptrs.size() << " instances" << std::endl; + + std::string best_op_name; + bool found = false; + int best_op_id = -1; + float best_ave_time = std::numeric_limits::max(); + float best_gb_per_sec = 0; + + // profile device operation instances + std::cout << "Run all instances and do timing" << std::endl; + + for(int i = 0; i < op_ptrs.size(); ++i) + { + auto& op_ptr = op_ptrs[i]; + + auto argument_ptr = op_ptr->MakeArgumentPointer(in_lengths, + in_strides, + out_lengths, + out_strides, + reduce_dims, + alpha, + beta, + in.GetDeviceBuffer(), + nullptr, + out.GetDeviceBuffer(), + nullptr, + PassThrough{}, + UnaryDivide{reduce_length}); + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + std::string op_name = op_ptr->GetTypeString(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true}); + + std::size_t num_bytes = num_in_elements * sizeof(InDataType) + + (beta == 0.0f ? 1 : 2) * num_out_elements * sizeof(OutDataType); + + float gb_per_sec = num_bytes / 1.E6 / ave_time; + + std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, " + << op_name << std::endl; + + if(ave_time < best_ave_time) + { + found = true; + best_op_id = i; + best_op_name = op_name; + best_ave_time = ave_time; + best_gb_per_sec = gb_per_sec; + } + } + else + { + std::cout << op_name << " does not support this problem" << std::endl; + } + } + + std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, " + << best_op_name << std::endl; + + // run the best intance + if(found) + { + auto& op_ptr = op_ptrs[best_op_id]; + std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString() + << std::endl; + auto argument_ptr = op_ptr->MakeArgumentPointer(in_lengths, + in_strides, + out_lengths, + out_strides, + reduce_dims, + alpha, + beta, + in.GetDeviceBuffer(), + nullptr, + out.GetDeviceBuffer(), + nullptr, + PassThrough{}, + UnaryDivide{reduce_length}); + + auto invoker_ptr = op_ptr->MakeInvokerPointer(); + + if(op_ptr->IsSupportedArgument(argument_ptr.get())) + { + invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false}); + } + + std::cout << "Done" << std::endl; + } + + return 0; +} diff --git a/example/12_reduce/reduce_blockwise_impl.hpp b/example/12_reduce/reduce_blockwise_impl.hpp index 7bafd2d2bb..6df549448d 100644 --- a/example/12_reduce/reduce_blockwise_impl.hpp +++ b/example/12_reduce/reduce_blockwise_impl.hpp @@ -9,6 +9,7 @@ #include "ck/utility/reduction_enums.hpp" #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" #include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_reduce.hpp" #include "ck/library/utility/algorithm.hpp" #include "ck/library/utility/check_err.hpp" @@ -16,7 +17,6 @@ #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_common_util.hpp" -#include "ck/library/utility/host_reduction.hpp" #include "reduce_example_common.hpp" @@ -236,29 +236,6 @@ int reduce_blockwise_impl(bool do_verification, reduce_unary_operator::GetElementwiseOperator( static_cast(reduce_total_length)); - if(do_verification) - { - ReductionHost - hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); - - hostReduce.Run(alpha, - in.mData.data(), - beta, - out_ref.mData.data(), - out_indices_ref.mData.data(), - in_elementwise_op, - acc_elementwise_op); - }; - std::array arrInLengths; std::array arrInStrides; std::array arrOutLengths; @@ -269,6 +246,48 @@ int reduce_blockwise_impl(bool do_verification, ck::ranges::copy(outLengths, arrOutLengths.begin()); ck::ranges::copy(outStrides, arrOutStrides.begin()); + if(do_verification) + { + using ReferenceReduceInstance = + ck::tensor_operation::host::ReferenceReduce; + + auto reduce_ref = ReferenceReduceInstance{}; + + auto argument_ptr_ref = reduce_ref.MakeArgumentPointer(arrInLengths, + arrInStrides, + arrOutLengths, + arrOutStrides, + reduceDims, + alpha, + beta, + in.mData.data(), + nullptr, + out_ref.mData.data(), + out_indices_ref.mData.data(), + in_elementwise_op, + acc_elementwise_op); + + if(!reduce_ref.IsSupportedArgument(argument_ptr_ref.get())) + { + std::cout << "The runtime parameters not supported by the reduce reference, exiting!" + << std::endl; + return (false); + }; + + auto invoker_ptr_ref = reduce_ref.MakeInvokerPointer(); + + invoker_ptr_ref->Run(argument_ptr_ref.get()); + }; + auto reduce = DeviceReduceInstance{}; auto argument_ptr = reduce.MakeArgumentPointer(arrInLengths, @@ -287,9 +306,8 @@ int reduce_blockwise_impl(bool do_verification, if(!reduce.IsSupportedArgument(argument_ptr.get())) { - std::cerr - << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!" - << std::endl; + std::cerr << "The runtime parameters not supported by the DeviceReduce instance, exiting!" + << std::endl; return (-2); }; diff --git a/example/12_reduce/reduce_blockwise_two_call.cpp b/example/12_reduce/reduce_blockwise_two_call.cpp index 39821f240a..a86ea7b56a 100644 --- a/example/12_reduce/reduce_blockwise_two_call.cpp +++ b/example/12_reduce/reduce_blockwise_two_call.cpp @@ -12,13 +12,13 @@ #include "ck/utility/reduction_enums.hpp" #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" #include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_reduce.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_common_util.hpp" -#include "ck/library/utility/host_reduction.hpp" using namespace ck; using namespace ck::tensor_operation::device; @@ -97,8 +97,8 @@ int main(int argc, char* argv[]) // const std::array invariantDims_2 = {0, 1, 2}; // used by the host reduction - const std::array reduceDims = {3, 4}; - const std::array invariantDims = {0, 1, 2}; + const std::array reduceDims = {3, 4}; + // const std::array invariantDims = {0, 1, 2}; const std::vector inLengths_1 = {64, 320, 80, 4, 128}; @@ -191,29 +191,6 @@ int main(int argc, char* argv[]) reduce_unary_operator::GetElementwiseOperator( static_cast(reduce_total_length)); - if(do_verify) - { - ReductionHost - hostReduce(in_1.mDesc, out_ref.mDesc, invariantDims, reduceDims); - - hostReduce.Run(alpha, - in_1.mData.data(), - beta, - out_ref.mData.data(), - nullptr, - in_elementwise_op, - acc_elementwise_op); - }; - std::array arrInLengths_1; std::array arrInStrides_1; std::array arrInLengths_2; @@ -228,6 +205,48 @@ int main(int argc, char* argv[]) ck::ranges::copy(outLengths, arrOutLengths.begin()); ck::ranges::copy(outStrides, arrOutStrides.begin()); + if(do_verify) + { + using ReferenceReduceInstance = + ck::tensor_operation::host::ReferenceReduce; + + auto reduce_ref = ReferenceReduceInstance{}; + + auto argument_ptr_ref = reduce_ref.MakeArgumentPointer(arrInLengths_1, + arrInStrides_1, + arrOutLengths, + arrOutStrides, + reduceDims, + alpha, + beta, + in_1.mData.data(), + nullptr, + out_ref.mData.data(), + nullptr, + in_elementwise_op, + acc_elementwise_op); + + if(!reduce_ref.IsSupportedArgument(argument_ptr_ref.get())) + { + std::cout << "The runtime parameters not supported by the reduce reference, exiting!" + << std::endl; + return (false); + }; + + auto invoker_ptr_ref = reduce_ref.MakeInvokerPointer(); + + invoker_ptr_ref->Run(argument_ptr_ref.get()); + }; + auto reduce_1 = DeviceReduceInstance_1{}; auto argument_ptr_1 = reduce_1.MakeArgumentPointer(arrInLengths_1, @@ -246,9 +265,8 @@ int main(int argc, char* argv[]) if(!reduce_1.IsSupportedArgument(argument_ptr_1.get())) { - std::cout - << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!" - << std::endl; + std::cout << "The runtime parameters seems supported by the DeviceReduce instance, exiting!" + << std::endl; }; auto invoker_ptr_1 = reduce_1.MakeInvokerPointer(); diff --git a/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp b/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp index 94867aee41..100a20d2a2 100644 --- a/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp +++ b/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp @@ -9,6 +9,7 @@ #include "ck/utility/reduction_enums.hpp" #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" #include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_reduce.hpp" #include "ck/library/utility/algorithm.hpp" #include "ck/library/utility/check_err.hpp" @@ -16,7 +17,6 @@ #include "ck/library/utility/host_tensor.hpp" #include "ck/library/utility/host_tensor_generator.hpp" #include "ck/library/utility/host_common_util.hpp" -#include "ck/library/utility/host_reduction.hpp" #include "reduce_example_common.hpp" @@ -149,29 +149,6 @@ int reduce_multiblock_atomic_add_impl(bool do_verification, reduce_unary_operator::GetElementwiseOperator( static_cast(reduce_total_length)); - if(do_verification) - { - ReductionHost - hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); - - hostReduce.Run(alpha, - in.mData.data(), - beta, - out_ref.mData.data(), - nullptr, - in_elementwise_op, - acc_elementwise_op); - }; - std::array arrInLengths; std::array arrInStrides; std::array arrOutLengths; @@ -182,6 +159,48 @@ int reduce_multiblock_atomic_add_impl(bool do_verification, ck::ranges::copy(outLengths, arrOutLengths.begin()); ck::ranges::copy(outStrides, arrOutStrides.begin()); + if(do_verification) + { + using ReferenceReduceInstance = + ck::tensor_operation::host::ReferenceReduce; + + auto reduce_ref = ReferenceReduceInstance{}; + + auto argument_ptr_ref = reduce_ref.MakeArgumentPointer(arrInLengths, + arrInStrides, + arrOutLengths, + arrOutStrides, + reduceDims, + alpha, + beta, + in.mData.data(), + nullptr, + out_ref.mData.data(), + nullptr, + in_elementwise_op, + acc_elementwise_op); + + if(!reduce_ref.IsSupportedArgument(argument_ptr_ref.get())) + { + std::cout << "The runtime parameters not supported by the reduce reference, exiting!" + << std::endl; + return (false); + }; + + auto invoker_ptr_ref = reduce_ref.MakeInvokerPointer(); + + invoker_ptr_ref->Run(argument_ptr_ref.get()); + }; + auto reduce = DeviceReduceInstance{}; auto argument_ptr = reduce.MakeArgumentPointer(arrInLengths, @@ -200,9 +219,8 @@ int reduce_multiblock_atomic_add_impl(bool do_verification, if(!reduce.IsSupportedArgument(argument_ptr.get())) { - std::cerr - << "The runtime parameters seems not supported by the DeviceReduce instance, exiting!" - << std::endl; + std::cerr << "The runtime parameters not supported by the DeviceReduce instance, exiting!" + << std::endl; return (-2); }; diff --git a/include/ck/tensor_operation/gpu/device/device_reduce.hpp b/include/ck/tensor_operation/gpu/device/device_reduce.hpp index 15aeb8e91c..531d0d0f81 100644 --- a/include/ck/tensor_operation/gpu/device/device_reduce.hpp +++ b/include/ck/tensor_operation/gpu/device/device_reduce.hpp @@ -13,10 +13,16 @@ namespace ck { namespace tensor_operation { namespace device { -template + typename AccElementwiseOperation, + bool PropagateNan, + bool OutputIndex> struct DeviceReduce : public BaseOperator { static constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 1 : Rank - NumReduceDim; @@ -39,12 +45,26 @@ struct DeviceReduce : public BaseOperator virtual std::unique_ptr MakeInvokerPointer() = 0; }; -template -using DeviceReducePtr = std::unique_ptr< - DeviceReduce>; + typename AccElementwiseOperation, + bool PropagateNan, + bool OutputIndex> +using DeviceReducePtr = std::unique_ptr>; } // namespace device } // namespace tensor_operation diff --git a/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp b/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp index dbeeb980a5..6b730b1265 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_multiple_reduce_multiblock.hpp @@ -73,8 +73,8 @@ struct DeviceMultipleReduceMultiBlock : public DeviceMultipleReduce{}([&](auto I) { using OutDataType = remove_cvref_t; flag = - flag && ck::reduce::InMemoryDataOperatonSupportedOnDataType::value; + flag && ck::reduce::InMemoryDataOperationSupportedOnDataType::value; }); return flag; diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp index 93855eb33e..8abe8884a1 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp @@ -40,8 +40,16 @@ template -struct DeviceReduceMultiBlock - : public DeviceReduce +struct DeviceReduceMultiBlock : public DeviceReduce { static_assert(Rank <= 6, "Bigger Rank size is not supported!"); static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize, @@ -67,8 +75,8 @@ struct DeviceReduceMultiBlock static constexpr bool use_multiblock = (OutMemoryDataOperation == InMemoryDataOperationEnum::AtomicAdd); - static_assert(ck::reduce::InMemoryDataOperatonSupportedOnDataType::value, + static_assert(ck::reduce::InMemoryDataOperationSupportedOnDataType::value, "The OutDataType must support the specified OutMemoryDataOperation!"); static_assert(!use_multiblock || (use_multiblock && !OutputIndex), diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp index 05e14f080e..888485228a 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp @@ -35,8 +35,17 @@ template -struct DeviceReduceThreadWise - : public DeviceReduce +struct DeviceReduceThreadWise : public DeviceReduce + { static_assert(Rank <= 6, "Bigger Rank size is not supported!"); diff --git a/include/ck/utility/reduction_operator.hpp b/include/ck/utility/reduction_operator.hpp index 25ae8fd34f..b4e770a64e 100644 --- a/include/ck/utility/reduction_operator.hpp +++ b/include/ck/utility/reduction_operator.hpp @@ -251,27 +251,27 @@ constexpr T GetIdentityValueForInMemoryDataOperation(InMemoryDataOperationEnum o }; template -struct InMemoryDataOperatonSupportedOnDataType +struct InMemoryDataOperationSupportedOnDataType { static constexpr bool value = false; }; template -struct InMemoryDataOperatonSupportedOnDataType +struct InMemoryDataOperationSupportedOnDataType { static constexpr bool value = is_same::value || is_same::value; }; template -struct InMemoryDataOperatonSupportedOnDataType +struct InMemoryDataOperationSupportedOnDataType { static constexpr bool value = is_same::value || is_same::value; }; template -struct InMemoryDataOperatonSupportedOnDataType +struct InMemoryDataOperationSupportedOnDataType { static constexpr bool value = is_same::value || is_same::value || @@ -280,7 +280,7 @@ struct InMemoryDataOperatonSupportedOnDataType -struct InMemoryDataOperatonSupportedOnDataType +struct InMemoryDataOperationSupportedOnDataType { static constexpr bool value = is_same::value || is_same::value || diff --git a/library/include/ck/library/reference_tensor_operation/cpu/reference_reduce.hpp b/library/include/ck/library/reference_tensor_operation/cpu/reference_reduce.hpp new file mode 100644 index 0000000000..c83523f0d1 --- /dev/null +++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_reduce.hpp @@ -0,0 +1,435 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include +#include +#include +#include + +#include "ck/ck.hpp" +#include "ck/utility/ignore.hpp" +#include "ck/utility/reduction_common.hpp" +#include "ck/utility/reduction_functions_accumulate.hpp" +#include "ck/library/utility/host_common_util.hpp" +#include "ck/library/utility/host_tensor.hpp" +#include "ck/tensor_operation/gpu/device/device_reduce.hpp" + +namespace ck { +namespace tensor_operation { +namespace host { + +template +struct ReferenceReduce : public device::DeviceReduce +{ + using IndexDataType = int32_t; + + static constexpr int NumInvariantDim = Rank - NumReduceDim; + + static constexpr index_t NumSrcDim = Rank; + static constexpr index_t NumDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim; + static constexpr bool reduceAllDim = (NumInvariantDim == 0); + + struct Argument : public device::BaseArgument + { + Argument(const std::array inLengths, + const std::array inStrides, + const std::array outLengths, + const std::array outStrides, + const std::array reduceDims, + float alpha, + float beta, + const InDataType* in_host, + OutDataType* out_host, + IndexDataType* out_index_host, + const InElementwiseOperation in_elementwise_op, + const AccElementwiseOperation acc_elementwise_op) + : reduceDims_(reduceDims), + outLengths_(outLengths), + outStrides_(outStrides), + in_host_(in_host), + out_host_(out_host), + out_index_host_(out_index_host), + in_elementwise_op_(in_elementwise_op), + acc_elementwise_op_(acc_elementwise_op) + { + using ck::host_common::get_index_set; + + if(std::any_of( + reduceDims.begin(), reduceDims.end(), [](int d) { return d < 0 || d >= Rank; })) + throw std::runtime_error("Invalid reduce dimensions!"); + + if constexpr(NumInvariantDim > 0) + { + // get invariant_dims[] and invariant_lengths[] + for(int dim = 0, i = 0; dim < Rank; dim++) + if(std::none_of( + reduceDims.begin(), reduceDims.end(), [&](int d) { return d == dim; })) + { + invariantDims_[i] = dim; + invariant_lengths_[i] = inLengths[dim]; + i++; + }; + }; + + // get reduce_lengths_[] + for(int j = 0, i = 0; j < NumReduceDim; j++) + { + int dim = reduceDims[j]; + reduce_lengths_[i++] = inLengths[dim]; + }; + + if constexpr(NumInvariantDim > 0) + { + // check invariant_lengths_ and outLengths + for(int i = 0; i < NumInvariantDim; i++) + if(invariant_lengths_[i] != outLengths_[i]) + throw std::runtime_error("Invalid lengths parameters!"); + } + + if constexpr(NumInvariantDim > 0) + { + for(int j = 0, i = 0; j < NumInvariantDim; j++) + { + int dim = invariantDims_[j]; + in_invariant_strides_[i] = inStrides[dim]; + i++; + }; + }; + + for(int j = 0, i = 0; j < NumReduceDim; j++) + { + int dim = reduceDims_[j]; + in_reduce_strides_[i] = inStrides[dim]; + i++; + }; + + if constexpr(NumInvariantDim > 0) + invariant_index_set_ = get_index_set(invariant_lengths_); + + reduce_index_set_ = get_index_set(reduce_lengths_); + + alpha_ = type_convert(alpha); + beta_ = type_convert(beta); + }; + + const std::array reduceDims_; + std::array invariantDims_; + std::array invariant_lengths_; + std::array reduce_lengths_; + + const std::array outLengths_; + const std::array outStrides_; + + std::array in_invariant_strides_; + std::array in_reduce_strides_; + + const InDataType* in_host_; + OutDataType* out_host_; + IndexDataType* out_index_host_; + const InElementwiseOperation in_elementwise_op_; + const AccElementwiseOperation acc_elementwise_op_; + + AccDataType alpha_; + AccDataType beta_; + + std::vector> invariant_index_set_; + std::vector> reduce_index_set_; + }; + + struct Invoker : public device::BaseInvoker + { + float Run(const Argument& arg, const StreamConfig& stream_config = StreamConfig{}) + { + ignore = stream_config; + + using ck::float_equal_one; + using ck::float_equal_zero; + using ck::type_convert; + using ck::host_common::get_index_set; + using ck::host_common::get_offset_from_index; + + if constexpr(OutputIndex) + { + using Accumulation = ck::detail::AccumulateWithIndexAndNanCheck; + + if constexpr(NumInvariantDim == 0) + { + AccDataType accuVal = ReduceOperation::template GetIdentityValue(); + IndexDataType accuIndex = 0; + + for(std::size_t i = 0; i < arg.reduce_index_set_.size(); i++) + { + auto in_offset = get_offset_from_index( + arg.in_reduce_strides_, arg.reduce_index_set_[i]); + + auto currVal = type_convert(arg.in_host_[in_offset]); + + arg.in_elementwise_op_(currVal, currVal); + + auto currIndex = static_cast(i); + + Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex); + }; + + arg.acc_elementwise_op_(accuVal, accuVal); + + if(!float_equal_one{}(arg.alpha_)) + accuVal *= type_convert(arg.alpha_); + + if(!float_equal_zero{}(arg.beta_)) + accuVal += type_convert(arg.out_host_[0]) * + type_convert(arg.beta_); + + arg.out_host_[0] = type_convert(accuVal); + arg.out_index_host_[0] = accuIndex; + } + else + { + auto thread_reduce_func = [&](auto invariant_index) { + AccDataType accuVal = + ReduceOperation::template GetIdentityValue(); + IndexDataType accuIndex = 0; + + auto in_invariant_offset = get_offset_from_index( + arg.in_invariant_strides_, invariant_index); + + for(std::size_t i = 0; i < arg.reduce_index_set_.size(); i++) + { + auto in_reduce_offset = get_offset_from_index( + arg.in_reduce_strides_, arg.reduce_index_set_[i]); + + auto currVal = type_convert( + arg.in_host_[in_invariant_offset + in_reduce_offset]); + + arg.in_elementwise_op_(currVal, currVal); + + auto currIndex = static_cast(i); + + Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex); + }; + + arg.acc_elementwise_op_(accuVal, accuVal); + + if(!float_equal_one{}(arg.alpha_)) + accuVal *= type_convert(arg.alpha_); + + auto dst_offset = get_offset_from_index(arg.outStrides_, + invariant_index); + + if(!float_equal_zero{}(arg.beta_)) + accuVal += type_convert(arg.out_host_[dst_offset]) * + type_convert(arg.beta_); + + arg.out_host_[dst_offset] = type_convert(accuVal); + arg.out_index_host_[dst_offset] = accuIndex; + }; + + std::size_t num_thread = std::thread::hardware_concurrency(); + + std::size_t work_per_thread = + (arg.invariant_index_set_.size() + num_thread - 1) / num_thread; + + std::vector threads(num_thread); + + for(std::size_t it = 0; it < num_thread; ++it) + { + std::size_t i_begin = it * work_per_thread; + std::size_t i_end = + std::min((it + 1) * work_per_thread, arg.invariant_index_set_.size()); + + auto f = [=] { + for(std::size_t i = i_begin; i < i_end; i++) + { + thread_reduce_func(arg.invariant_index_set_[i]); + } + }; + + threads[it] = joinable_thread(f); + } + }; + } + else + { + using Accumulation = + ck::detail::AccumulateWithNanCheck; + + if constexpr(NumInvariantDim == 0) + { + AccDataType accuVal = ReduceOperation::template GetIdentityValue(); + + for(const auto& reduce_index : arg.reduce_index_set_) + { + auto in_offset = get_offset_from_index(arg.in_reduce_strides_, + reduce_index); + + auto currVal = type_convert(arg.in_host_[in_offset]); + + arg.in_elementwise_op_(currVal, currVal); + + Accumulation::Calculate(accuVal, currVal); + }; + + arg.acc_elementwise_op_(accuVal, accuVal); + + if(!float_equal_one{}(arg.alpha_)) + accuVal *= type_convert(arg.alpha_); + + if(!float_equal_zero{}(arg.beta_)) + accuVal += type_convert(arg.out_host_[0]) * + type_convert(arg.beta_); + + arg.out_host_[0] = type_convert(accuVal); + } + else + { + auto thread_reduce_func = [&](auto invariant_index) { + AccDataType accuVal = + ReduceOperation::template GetIdentityValue(); + + auto in_invariant_offset = get_offset_from_index( + arg.in_invariant_strides_, invariant_index); + + for(const auto& reduce_index : arg.reduce_index_set_) + { + auto in_reduce_offset = get_offset_from_index( + arg.in_reduce_strides_, reduce_index); + + auto currVal = type_convert( + arg.in_host_[in_invariant_offset + in_reduce_offset]); + + arg.in_elementwise_op_(currVal, currVal); + + Accumulation::Calculate(accuVal, currVal); + }; + + arg.acc_elementwise_op_(accuVal, accuVal); + + if(!float_equal_one{}(arg.alpha_)) + accuVal *= type_convert(arg.alpha_); + + auto dst_offset = get_offset_from_index(arg.outStrides_, + invariant_index); + + if(!float_equal_zero{}(arg.beta_)) + accuVal += type_convert(arg.out_host_[dst_offset]) * + type_convert(arg.beta_); + + arg.out_host_[dst_offset] = type_convert(accuVal); + }; + + std::size_t num_thread = std::thread::hardware_concurrency(); + + std::size_t work_per_thread = + (arg.invariant_index_set_.size() + num_thread - 1) / num_thread; + + std::vector threads(num_thread); + + for(std::size_t it = 0; it < num_thread; ++it) + { + std::size_t i_begin = it * work_per_thread; + std::size_t i_end = + std::min((it + 1) * work_per_thread, arg.invariant_index_set_.size()); + + auto f = [=] { + for(std::size_t i = i_begin; i < i_end; i++) + { + thread_reduce_func(arg.invariant_index_set_[i]); + } + }; + + threads[it] = joinable_thread(f); + } + }; + }; + + return (0.0f); + }; + + float Run(const device::BaseArgument* p_arg, + const StreamConfig& stream_config = StreamConfig{}) override + { + return Run(*dynamic_cast(p_arg), stream_config); + }; + }; + + bool IsSupportedArgument(const device::BaseArgument* p_arg) override + { + ignore = p_arg; + + return true; + }; + + std::unique_ptr + MakeArgumentPointer(const std::array inLengths, + const std::array inStrides, + const std::array outLengths, + const std::array outStrides, + const std::array reduceDims, + float alpha, + float beta, + const void* in_host, + const void* in_index_host, + void* out_host, + void* out_index_host, + const InElementwiseOperation in_elementwise_op, + const AccElementwiseOperation acc_elementwise_op) override + { + ignore = in_index_host; + + return std::make_unique(inLengths, + inStrides, + outLengths, + outStrides, + reduceDims, + alpha, + beta, + static_cast(in_host), + static_cast(out_host), + static_cast(out_index_host), + in_elementwise_op, + acc_elementwise_op); + }; + + std::unique_ptr MakeInvokerPointer() override + { + return std::make_unique(); + }; + + std::string GetTypeString() const override + { + auto str = std::stringstream(); + + // clang-format off + str << "Reference_Reduce<" << std::endl; + // clang-format on + + return str.str(); + } +}; + +} // namespace host +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp index 90cfe837df..2cdbfbb0c2 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp @@ -76,8 +76,16 @@ template void add_device_reduce_instance_blockwise( - std::vector>& - device_op_instances) + std::vector>& device_op_instances) { static_for<0, std::tuple_size::value, 1>{}( [&](auto i) { diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp index 521d93e600..4e3fa81f75 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp index fe3fd6c0a7..7ca8bc258a 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp index 52a2b69cdd..37398146b8 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp index ee4fee41ea..5eacd358c8 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp index 3abdb7f958..94ae02bf3d 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp index b0dbcf31dd..e41e8de6a5 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp index 7bbf3df0a3..99762aa64b 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp index 559f322261..1fc557a95d 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp index 28c9610789..ca3ba4eb0b 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp index 5080d28636..28a85782d1 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp index 0d24d15371..ba74400793 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp index c806e807c8..f5c813de78 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp index b7c046e751..e25b6e8493 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp index 771bec1c95..a264d11262 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp index c1fe8addba..8b1d8c95ba 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp index 6bc0662fea..49a60d88c3 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp index 6f8005132d..04a7c2d238 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp index c771ac4fab..d0feefb50d 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp index b9ddbb9aea..35f35f202c 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp index 390a719ceb..63eb7221b5 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp index 2a9ddbc61b..1bca3c1f43 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp index 5746884442..1791a186f5 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp index ad0f2357e0..3f56c057ef 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp index c7d9527638..a3b8bcf9a0 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp index ec56229937..18e0e084d7 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp index 48f66da659..4a106463a3 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp index fabfa5b4c6..23e1c49fe9 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp index e08faec200..62e2d24f02 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp index a1e692aae3..18a54d8686 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp index e9654e8cce..9f408906a7 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp index 7824421309..c40052562f 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp index df323d40b3..532bfb417e 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); -extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp index acf55d0683..0d08377a22 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp @@ -76,8 +76,16 @@ template void add_device_reduce_instance_multiblock_atomic_add( - std::vector>& - device_op_instances) + std::vector>& device_op_instances) { static_for<0, std::tuple_size::value, diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp index f5102f4977..4cdd45e85b 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp index ec513113d9..a36cafb27a 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp index 3a3d53b8c6..13b0780497 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp index bbf4398964..75e1f10242 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp index 55147a60e5..00ec17fada 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp index 4bff06c6af..7b762bc932 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp index daffa1aa4d..2a2b284b22 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); // clang-format on // clang-format on diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp index 52c4171123..444d8ddc86 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); // clang-format on // clang-format on diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp index 2f358b06e0..f3c0701761 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp index 84c99dcc57..c57edd0846 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp index dfcc8dd854..f77c50a8e8 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp @@ -62,8 +62,16 @@ template void add_device_reduce_instance_threadwise( - std::vector>& - device_op_instances) + std::vector>& device_op_instances) { using cfg1 = ReductionConfiguration_1<256, 256, 1>; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp index 4168508b28..8960ba7c5b 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.hpp index 317006e3a5..95d9c07265 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp index fc7718ddc0..dd6734061e 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.hpp index e6616386ca..85f75110df 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.hpp index a9441b8e8e..7f62f4e010 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp index 6820ace8cf..eee771b133 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp index ab3d4e6e2c..64f1e9c22b 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp index ee08c9635b..078561e153 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp index 1007ca27bb..5a9144186b 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp index 1d562c4999..dc4740aa3d 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp index 5aac638b1e..9ecc96797f 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp index 7a3c764097..ccce78e2f1 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp index 4685d7b5d5..6d3749d868 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.hpp index 1de338fb48..7594dde74d 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp index e86c41a949..3272e7f9af 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.hpp index 2ca9008560..519ec8271d 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.hpp index 38380e71ec..77b2fb9306 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp index 04c5f3e658..5abb5c5eec 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp index fef5d40884..23bd988b8a 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp index 2416f614c3..7ce5577d7f 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp index fbd0285ae8..7e4c5b77f0 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp index 103b85a011..5eca5fea7f 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp index e01f590f0e..b0e98411bf 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp index 14a7459bb8..84609a995d 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp index 7dfd806012..2f816bb11d 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp index 7670a27c84..9cecd4a5b4 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp index 8bb85f3779..42e9b7fc79 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp index a005ba8d42..494f1c3d71 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp index 9e8c07eb4f..a80abb9247 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp @@ -15,10 +15,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp index a69f88f5a9..53fd286383 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp index 734b31c1e9..df5a4db484 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp index 237bd96966..ed78acd926 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp @@ -15,14 +15,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); -extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp new file mode 100644 index 0000000000..0038fc26da --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp @@ -0,0 +1,117 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" +#include "ck/tensor_operation/gpu/device/device_reduce.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp" +#include "ck/utility/reduction_operator.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +template +struct DeviceOperationInstanceFactory> +{ + using DeviceOp = DeviceReduce; + + using DeviceOpPtr = DeviceReducePtr; + + static auto GetInstances() + { + std::vector op_ptrs; + + constexpr bool out_support_atomic_add = + ck::reduce::InMemoryDataOperationSupportedOnDataType< + InMemoryDataOperationEnum::AtomicAdd, + OutDataType>::value; + constexpr bool op_support_atomic_add = + std::is_same::value && + (std::is_same::value || + std::is_same::value); + constexpr bool use_atomic_add = (out_support_atomic_add && op_support_atomic_add); + + add_device_reduce_instance_threadwise(op_ptrs); + + add_device_reduce_instance_blockwise(op_ptrs); + + if constexpr(use_atomic_add) + { + add_device_reduce_instance_multiblock_atomic_add(op_ptrs); + }; + + return op_ptrs; + } +}; + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/utility/host_reduction.hpp b/library/include/ck/library/utility/host_reduction.hpp deleted file mode 100644 index 7c0c969ac5..0000000000 --- a/library/include/ck/library/utility/host_reduction.hpp +++ /dev/null @@ -1,374 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include -#include -#include - -#include "ck/utility/data_type.hpp" -#include "ck/utility/reduction_enums.hpp" -#include "ck/utility/reduction_common.hpp" -#include "ck/utility/reduction_functions_accumulate.hpp" -#include "ck/library/utility/host_common_util.hpp" -#include "ck/library/utility/host_tensor.hpp" - -template -static void get_all_indexes(const std::array& dimLengths, - std::vector>& indexes) -{ - static_assert(NDim >= 1, "NDim >= 1 is required to use this function!"); - - if constexpr(NDim == 1) - { - for(size_t i = 0; i < dimLengths[0]; i++) - { - std::array index{i}; - - indexes.push_back(index); - }; - } - else - { - std::array partial_dim_lengths; - - for(int i = 0; i < NDim - 1; i++) - partial_dim_lengths[i] = dimLengths[i + 1]; - - std::vector> partial_indexes; - - get_all_indexes(partial_dim_lengths, partial_indexes); - - for(size_t i = 0; i < dimLengths[0]; i++) - for(const auto& index : partial_indexes) - { - std::array extIndex; - - extIndex[0] = i; - - for(int k = 0; k < NDim - 1; k++) - extIndex[k + 1] = index[k]; - - indexes.push_back(extIndex); - }; - }; -}; - -template -static size_t get_offset_from_index(const std::array& strides, - const std::array& index) -{ - size_t offset = 0; - - for(int i = 0; i < NDim; i++) - offset += strides[i] * index[i]; - - return (offset); -}; - -template -static size_t get_offset_from_index(const std::vector& strides, - const std::array& index) -{ - size_t offset = 0; - - for(int i = 0; i < NDim; i++) - offset += strides[i] * index[i]; - - return (offset); -}; - -template -struct ReductionHost -{ - using IndexDataType = int32_t; - - static constexpr int NumInvariantDim = Rank - NumReduceDim; - - std::vector outStrides; - - IndexDataType divider; - - std::array reduceLengths; - std::array reduceStrides; - std::array invariantLengths; - std::array invariantStrides; - - std::vector> reduce_dim_indexes; - std::vector> invariant_dim_indexes; - - ReductionHost(HostTensorDescriptor& inDesc, - HostTensorDescriptor& outDesc, - const std::array invariantDims, - const std::array reduceDims) - { - // this->outLengths = to_int_vector(outDesc.GetLengths()); - this->outStrides = outDesc.GetStrides(); - - int product = 1; - - for(int i = 0; i < NumReduceDim; i++) - { - reduceLengths[i] = inDesc.GetLengths()[reduceDims[i]]; - reduceStrides[i] = inDesc.GetStrides()[reduceDims[i]]; - product *= inDesc.GetLengths()[reduceDims[i]]; - }; - - divider = product; - - for(int i = 0; i < NumInvariantDim; i++) - { - invariantLengths[i] = inDesc.GetLengths()[invariantDims[i]]; - invariantStrides[i] = inDesc.GetStrides()[invariantDims[i]]; - }; - - reduce_dim_indexes.clear(); - get_all_indexes(reduceLengths, reduce_dim_indexes); - - if constexpr(NumInvariantDim > 0) - { - invariant_dim_indexes.clear(); - get_all_indexes(invariantLengths, invariant_dim_indexes); - }; - }; - - void Run(float alpha, - const InDataType* in_data, - float beta, - OutDataType* out_data, - IndexDataType* out_indices, - InElementwiseOperation in_elementwise_op, - AccElementwiseOperation acc_elementwise_op) - { - if constexpr(OutputIndex) - { - RunImpl_with_index( - alpha, in_data, beta, out_data, out_indices, in_elementwise_op, acc_elementwise_op); - } - else - { - RunImpl_no_index(alpha, in_data, beta, out_data, in_elementwise_op, acc_elementwise_op); - }; - }; - - void RunImpl_with_index(float alpha, - const InDataType* in_data, - float beta, - OutDataType* out_data, - IndexDataType* out_indices, - InElementwiseOperation in_elementwise_op, - AccElementwiseOperation acc_elementwise_op) - { - using ck::float_equal_one; - using ck::float_equal_zero; - using ck::type_convert; - - using Accumulation = ck::detail::AccumulateWithIndexAndNanCheck; - - if constexpr(NumInvariantDim == 0) - { - AccDataType accuVal = ReduceOperation::template GetIdentityValue(); - IndexDataType accuIndex = 0; - - for(std::size_t i = 0; i < reduce_dim_indexes.size(); i++) - { - auto offset_reduce = - get_offset_from_index(reduceStrides, reduce_dim_indexes[i]); - - auto currVal = type_convert(in_data[offset_reduce]); - - in_elementwise_op(currVal, currVal); - - auto currIndex = static_cast(i); - - Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex); - }; - - acc_elementwise_op(accuVal, accuVal); - - if(!float_equal_one{}(alpha)) - accuVal *= type_convert(alpha); - - if(!float_equal_zero{}(beta)) - accuVal += type_convert(out_data[0]) * type_convert(beta); - - out_data[0] = type_convert(accuVal); - out_indices[0] = accuIndex; - } - else - { - auto thread_reduce_func = [&](auto invariant_index) { - AccDataType accuVal = ReduceOperation::template GetIdentityValue(); - IndexDataType accuIndex = 0; - - auto offset_invariant = - get_offset_from_index(invariantStrides, invariant_index); - - for(std::size_t i = 0; i < reduce_dim_indexes.size(); i++) - { - auto offset_reduce = - get_offset_from_index(reduceStrides, reduce_dim_indexes[i]); - - auto currVal = - type_convert(in_data[offset_invariant + offset_reduce]); - - in_elementwise_op(currVal, currVal); - - auto currIndex = static_cast(i); - - Accumulation::Calculate(accuVal, currVal, accuIndex, currIndex); - }; - - acc_elementwise_op(accuVal, accuVal); - - if(!float_equal_one{}(alpha)) - accuVal *= type_convert(alpha); - - auto dst_offset = - get_offset_from_index(outStrides, invariant_index); - - if(!float_equal_zero{}(beta)) - accuVal += type_convert(out_data[dst_offset]) * - type_convert(beta); - - out_data[dst_offset] = type_convert(accuVal); - out_indices[dst_offset] = accuIndex; - }; - - std::size_t num_thread = 1; - std::size_t work_per_thread = - (invariant_dim_indexes.size() + num_thread - 1) / num_thread; - - std::vector threads(num_thread); - - for(std::size_t it = 0; it < num_thread; ++it) - { - std::size_t iw_begin = it * work_per_thread; - std::size_t iw_end = - std::min((it + 1) * work_per_thread, invariant_dim_indexes.size()); - - auto f = [=] { - for(std::size_t iw = iw_begin; iw < iw_end; ++iw) - { - thread_reduce_func(invariant_dim_indexes[iw]); - } - }; - - threads[it] = joinable_thread(f); - } - }; - }; - - void RunImpl_no_index(float alpha, - const InDataType* in_data, - float beta, - OutDataType* out_data, - InElementwiseOperation in_elementwise_op, - AccElementwiseOperation acc_elementwise_op) - { - using ck::float_equal_one; - using ck::float_equal_zero; - using ck::type_convert; - - using Accumulation = - ck::detail::AccumulateWithNanCheck; - - if constexpr(NumInvariantDim == 0) - { - AccDataType accuVal = ReduceOperation::template GetIdentityValue(); - - for(const auto& reduce_index : reduce_dim_indexes) - { - auto offset_reduce = - get_offset_from_index(reduceStrides, reduce_index); - - auto currVal = type_convert(in_data[offset_reduce]); - - in_elementwise_op(currVal, currVal); - - Accumulation::Calculate(accuVal, currVal); - }; - - acc_elementwise_op(accuVal, accuVal); - - if(!float_equal_one{}(alpha)) - accuVal *= type_convert(alpha); - - if(!float_equal_zero{}(beta)) - accuVal += type_convert(out_data[0]) * type_convert(beta); - - out_data[0] = type_convert(accuVal); - } - else - { - auto thread_reduce_func = [&](auto invariant_index) { - AccDataType accuVal = ReduceOperation::template GetIdentityValue(); - - auto offset_invariant = - get_offset_from_index(invariantStrides, invariant_index); - - for(const auto& reduce_index : reduce_dim_indexes) - { - auto offset_reduce = - get_offset_from_index(reduceStrides, reduce_index); - - auto currVal = - type_convert(in_data[offset_invariant + offset_reduce]); - - in_elementwise_op(currVal, currVal); - - Accumulation::Calculate(accuVal, currVal); - }; - - acc_elementwise_op(accuVal, accuVal); - - if(!float_equal_one{}(alpha)) - accuVal *= type_convert(alpha); - - auto dst_offset = - get_offset_from_index(outStrides, invariant_index); - - if(!float_equal_zero{}(beta)) - accuVal += type_convert(out_data[dst_offset]) * - type_convert(beta); - - out_data[dst_offset] = type_convert(accuVal); - }; - - std::size_t num_thread = 1; - std::size_t work_per_thread = - (invariant_dim_indexes.size() + num_thread - 1) / num_thread; - - std::vector threads(num_thread); - - for(std::size_t it = 0; it < num_thread; ++it) - { - std::size_t iw_begin = it * work_per_thread; - std::size_t iw_end = - std::min((it + 1) * work_per_thread, invariant_dim_indexes.size()); - - auto f = [=] { - for(std::size_t iw = iw_begin; iw < iw_end; ++iw) - { - thread_reduce_func(invariant_dim_indexes[iw]); - } - }; - - threads[it] = joinable_thread(f); - } - }; - }; -}; diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp index 1909183a55..cf46059a0d 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp index ec30201021..0043b19844 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp index 89f3e58280..6f702ddf1f 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp index f1bdd1927b..d1f70dc99e 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp index 58e9c56229..a957981a55 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp index e5012c651a..550a9cd76c 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.cpp index 0970cb9d7c..58cb6ee348 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp index 6ee179a511..1ac5e79bc1 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp index e53b403065..b1e1a06800 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp index cab5738fba..1a15b32d23 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp index 7d2a4fad2a..119f384b4e 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp index e08b64f8b3..3f1bd86b8b 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp index 89cabf3762..b507f0d1f9 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp index 1e602c121d..04d0ea2e8b 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp index 489b4bc452..3de561f2b6 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp index 04e2c5b164..3f45b03134 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp index 5c0e536048..76851d9b72 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp index 899dfcd37c..9cef019320 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on // clang-format on diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp index 5624337a47..ce73ec47e3 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp index 2f3067ce29..ed6091f924 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp index 2648e7d59d..4c8375de16 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp index f67ae2ee7c..0fa93ab688 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.cpp index 6f8e07851d..821eec1751 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp index 69fecf72f5..0305b4945f 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp index 129a4f0f0e..1bda0bcc71 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp index 21babc4aa6..7f8018a04e 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp index b85b3e2b68..887a89cc2b 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp index 24a8293b5d..0cc810363d 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp index 73e60fa959..4c825a9f1b 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp index 72e649d897..bf26913fd3 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp index a7e053a065..629299c7b1 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp index 0e3abd35b4..9a08634498 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); -template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp index 4b32456074..6dc925bd6f 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp index 3298587a42..470d68d372 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp index 729d4fd6e1..39303ab580 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp index e3e36e312b..a5481784ed 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp index e7580e7d7d..aa6e6d3cc9 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp index 1e6feb0071..b1ea551eaa 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp index 669c4d34ca..2ba83132d8 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp index 335a5474ce..b9018e7c6a 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp index e95e8391a2..1abc6de552 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp index 25498158a2..f3a017aeb4 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); -template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.cpp index 7262b8a5ba..329617bb43 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.cpp index c526a74f1a..1e4d43debd 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.cpp index 4c7252e742..f9f79675f5 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.cpp index 618900a7d7..d3e7268c5a 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.cpp index ce747cbc76..a41a12386d 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp index 06f622b9e6..6da1acc4e5 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.cpp index 708eb58d40..f14b8a4031 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.cpp index c8a62fa149..5a9f08167c 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.cpp index ce2092153c..d3aff06753 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.cpp index 29251a8b9a..55f7537d83 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.cpp index 734fa9fd3e..70f91168d8 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp index d7a0e2bfe8..47f5e67fe0 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp index 8b97f3008b..eae489ff04 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp index 53d01e38d6..9fb267a201 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp index 125d054f3d..fecb2691f8 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp index fb86a2bbe4..232d2b858b 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp index 49af08390a..07d45c4ca9 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp index 30cc1b13ec..596a062f3a 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp index 24f8a9ba5c..7270cefe8b 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp index a26702f053..d0f4ef3dff 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp index 34fe32628f..9c6bce92f4 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp index 74b15eddba..5faf8d8283 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp index 65762492f7..8f3c72451a 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp index 5e74295a0d..8d7794f42e 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp index 6fdea6cc4d..4a32543a13 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp index 317d573dac..26d571c84e 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp index 29f95ebcc7..ae56a2a919 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp index aa9f47cbc4..aae3233c9f 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on // clang-format on diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp index 54a9dd1ab7..94d5d3fa2f 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp @@ -11,10 +11,10 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp index 4ef5717b5e..dad190a634 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp index 140a3c197b..b7ca6998f5 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp index 317b4ad39c..22c40187ea 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp @@ -11,14 +11,14 @@ namespace instance { // clang-format off // InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); -template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); // clang-format on } // namespace instance diff --git a/profiler/include/profiler/profile_reduce_impl.hpp b/profiler/include/profiler/profile_reduce_impl.hpp index ccb99398f2..0759c53a3c 100644 --- a/profiler/include/profiler/profile_reduce_impl.hpp +++ b/profiler/include/profiler/profile_reduce_impl.hpp @@ -6,11 +6,11 @@ #include "ck/utility/reduction_enums.hpp" #include "ck/tensor_operation/gpu/device/device_reduce.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/reduce.hpp" #include "ck/library/utility/algorithm.hpp" #include "ck/library/utility/check_err.hpp" #include "ck/library/utility/device_memory.hpp" -#include "ck/library/utility/host_reduction.hpp" +#include "ck/library/reference_tensor_operation/cpu/reference_reduce.hpp" #include "ck/library/utility/host_common_util.hpp" #include "ck/library/utility/host_tensor_generator.hpp" @@ -158,11 +158,6 @@ bool profile_reduce_impl_impl(bool do_verification, constexpr bool OutputIndex = (op_support_indices && UseIndex); - constexpr bool out_support_atomic_add = std::is_same::value; - constexpr bool op_support_atomic_add = - !op_support_indices && ReduceOpId != ReduceTensorOp::NORM2; - constexpr bool use_atomic_add = (out_support_atomic_add && op_support_atomic_add); - // 1) If InDataType is half_t, must use half_t as AccDataType for indexable reduction operations // 2) If InDataType is half_t, must use float as AccDataType for non-indexable reduction // operations @@ -200,7 +195,8 @@ bool profile_reduce_impl_impl(bool do_verification, constexpr bool invalid_reduce = (invalid_reduce_1 || invalid_reduce_2 || invalid_reduce_3 || invalid_reduce_4 || invalid_reduce_5 || invalid_reduce_6); - bool pass = true; + int num_kernel = 0; + bool pass = true; if constexpr(!invalid_reduce) { @@ -286,75 +282,25 @@ bool profile_reduce_impl_impl(bool do_verification, reduce_unary_operator::GetElementwiseOperator( static_cast(reduce_total_length)); - using DeviceReduceInstPtr = - DeviceReducePtr; - - std::vector reduce_ptrs; - - add_device_reduce_instance_threadwise(reduce_ptrs); - - add_device_reduce_instance_blockwise(reduce_ptrs); - - if constexpr(use_atomic_add) - { - add_device_reduce_instance_multiblock_atomic_add(reduce_ptrs); - } + using ReduceOp = ck::tensor_operation::device::DeviceReduce; + const auto reduce_ptrs = + ck::tensor_operation::device::instance::DeviceOperationInstanceFactory< + ReduceOp>::GetInstances(); if(reduce_ptrs.empty()) { throw std::runtime_error("Wrong! No device REDUCE instance found"); }; - if(do_verification) - { - ReductionHost - hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims); - - hostReduce.Run(alpha, - in.mData.data(), - beta, - out_ref.mData.data(), - out_indices_ref.mData.data(), - in_elementwise_op, - acc_elementwise_op); - }; - std::array arrInLengths; std::array arrInStrides; std::array arrOutLengths; @@ -365,6 +311,49 @@ bool profile_reduce_impl_impl(bool do_verification, ck::ranges::copy(outLengths, arrOutLengths.begin()); ck::ranges::copy(outStrides, arrOutStrides.begin()); + if(do_verification) + { + using ReferenceReduceInstance = + ck::tensor_operation::host::ReferenceReduce; + + auto reduce_ref = ReferenceReduceInstance{}; + + auto argument_ptr_ref = reduce_ref.MakeArgumentPointer(arrInLengths, + arrInStrides, + arrOutLengths, + arrOutStrides, + reduceDims, + alpha, + beta, + in.mData.data(), + nullptr, + out_ref.mData.data(), + out_indices_ref.mData.data(), + in_elementwise_op, + acc_elementwise_op); + + if(!reduce_ref.IsSupportedArgument(argument_ptr_ref.get())) + { + std::cout + << "The runtime parameters not supported by the reduce reference, exiting!" + << std::endl; + return (false); + }; + + auto invoker_ptr_ref = reduce_ref.MakeInvokerPointer(); + + (void)invoker_ptr_ref->Run(argument_ptr_ref.get()); + }; + for(auto& reduce_ptr : reduce_ptrs) { auto argument_ptr = reduce_ptr->MakeArgumentPointer(arrInLengths, @@ -383,6 +372,8 @@ bool profile_reduce_impl_impl(bool do_verification, if(!reduce_ptr->IsSupportedArgument(argument_ptr.get())) continue; + else + num_kernel++; std::string reduce_name = reduce_ptr->GetTypeString(); @@ -446,14 +437,20 @@ bool profile_reduce_impl_impl(bool do_verification, }; }; - if(time_kernel) + if(time_kernel && num_kernel > 0) std::cout << "Best Perf: " << best_avg_time << " ms, " << best_gb_per_sec << " GB/s" << std::endl; } else { - std::cout << "The requested reduction operation is not supported, please check !!!" - << std::endl; + throw std::runtime_error( + "The requested reduction operation is not supported, please check!"); + }; + + if(num_kernel == 0) + { + std::cout << "Error: No kernel is applicable" << std::endl; + return false; }; return pass;