From 95dacf7c9659d2e51ac431bd77f8106eaa811814 Mon Sep 17 00:00:00 2001
From: Qianfeng
Date: Tue, 25 Oct 2022 23:39:11 +0800
Subject: [PATCH] Update to the Reduction API and instances (#476)

* Simplify the macros for declaring and defining the add_device_reduce_instance_xxxx() instances
* Change the types of lengths and strides from std::vector to std::array for the reduction device interfaces
* Remove DeviceSoftmaxImpl's dependency on DeviceReduceMultiblock
* Split the cpp and hpp files for the reduction instances to enable more parallel compilation
* Remove the use of macros for declaring reduction instances and instance references
* Update the add_device_reduce_instance_xxxx() templated functions
* Use ReduceOperation + InElementwiseOp + AccElementwiseOp to replace the ReduceOpId when defining the add_reduce_instance_xxxx() templates
* Change return format

[ROCm/composable_kernel commit: dda3a0a10bb62a6d47e3559b89146a9d02361502]
---
example/12_reduce/reduce_blockwise.cpp | 6 +- example/12_reduce/reduce_blockwise_impl.hpp | 30 +-- .../12_reduce/reduce_blockwise_two_call.cpp | 52 ++--- example/12_reduce/reduce_example_common.hpp | 13 +- .../reduce_multiblock_atomic_add.cpp | 6 +- .../reduce_multiblock_atomic_add_impl.hpp | 30 +-- .../gpu/device/device_reduce.hpp | 32 +-- .../device/impl/device_reduce_multiblock.hpp | 80 +++---- .../device/impl/device_reduce_threadwise.hpp | 66 +++--- .../gpu/device/impl/device_softmax_impl.hpp | 195 ++++++++++++----- .../device_operation_instance_factory.hpp | 7 +- .../gpu/reduce/device_reduce_instance.hpp | 95 +++++++-- .../device_reduce_instance_blockwise.hpp | 77 +------ ..._reduce_instance_blockwise_b16_f32_b16.hpp | 59 ------ ...uce_instance_blockwise_b16_f32_b16_add.hpp | 27 +++ ...ce_instance_blockwise_b16_f32_b16_amax.hpp | 31 +++ ...uce_instance_blockwise_b16_f32_b16_avg.hpp | 27 +++ ...uce_instance_blockwise_b16_f32_b16_max.hpp | 31 +++ ...uce_instance_blockwise_b16_f32_b16_min.hpp | 31 +++ ...e_instance_blockwise_b16_f32_b16_norm2.hpp | 27 +++ ..._reduce_instance_blockwise_f16_f16_f16.hpp | 46 ---- ...ce_instance_blockwise_f16_f16_f16_amax.hpp | 31 +++ ...uce_instance_blockwise_f16_f16_f16_max.hpp | 31 +++ ...uce_instance_blockwise_f16_f16_f16_min.hpp | 31 +++ ..._reduce_instance_blockwise_f16_f32_f16.hpp | 34 --- ...uce_instance_blockwise_f16_f32_f16_add.hpp | 27 +++ ...uce_instance_blockwise_f16_f32_f16_avg.hpp | 27 +++ ...e_instance_blockwise_f16_f32_f16_norm2.hpp | 27 +++ ..._reduce_instance_blockwise_f32_f32_f32.hpp | 58 ------ ...uce_instance_blockwise_f32_f32_f32_add.hpp | 27 +++ ...ce_instance_blockwise_f32_f32_f32_amax.hpp | 31 +++ ...uce_instance_blockwise_f32_f32_f32_avg.hpp | 27 +++ ...uce_instance_blockwise_f32_f32_f32_max.hpp | 31 +++ ...uce_instance_blockwise_f32_f32_f32_min.hpp | 31 +++ ...e_instance_blockwise_f32_f32_f32_norm2.hpp | 27 +++ ..._reduce_instance_blockwise_f32_f64_f32.hpp | 34 --- ...uce_instance_blockwise_f32_f64_f32_add.hpp | 27 +++ ...uce_instance_blockwise_f32_f64_f32_avg.hpp | 27 +++ ...e_instance_blockwise_f32_f64_f32_norm2.hpp | 27 +++ ..._reduce_instance_blockwise_f64_f64_f64.hpp | 58 ------ ...uce_instance_blockwise_f64_f64_f64_add.hpp | 27 +++ ...ce_instance_blockwise_f64_f64_f64_amax.hpp | 31 +++ ...uce_instance_blockwise_f64_f64_f64_avg.hpp | 27 +++ ...uce_instance_blockwise_f64_f64_f64_max.hpp | 31 +++ ...uce_instance_blockwise_f64_f64_f64_min.hpp | 31 +++ ...e_instance_blockwise_f64_f64_f64_norm2.hpp | 27 +++ ...ce_reduce_instance_blockwise_i8_i32_i8.hpp | 30 --- ...educe_instance_blockwise_i8_i32_i8_add.hpp | 27 +++ 
...educe_instance_blockwise_i8_i32_i8_avg.hpp | 27 +++ ...ice_reduce_instance_blockwise_i8_i8_i8.hpp | 46 ---- ...educe_instance_blockwise_i8_i8_i8_amax.hpp | 31 +++ ...reduce_instance_blockwise_i8_i8_i8_max.hpp | 31 +++ ...reduce_instance_blockwise_i8_i8_i8_min.hpp | 31 +++ .../device_reduce_instance_impl_common.hpp | 13 ++ ..._reduce_instance_multiblock_atomic_add.hpp | 152 ++++---------- ...ance_multiblock_atomic_add_b16_f32_f32.hpp | 30 --- ..._multiblock_atomic_add_b16_f32_f32_add.hpp | 27 +++ ..._multiblock_atomic_add_b16_f32_f32_avg.hpp | 27 +++ ...ance_multiblock_atomic_add_f16_f32_f32.hpp | 30 --- ..._multiblock_atomic_add_f16_f32_f32_add.hpp | 27 +++ ..._multiblock_atomic_add_f16_f32_f32_avg.hpp | 27 +++ ...ance_multiblock_atomic_add_f32_f32_f32.hpp | 30 --- ..._multiblock_atomic_add_f32_f32_f32_add.hpp | 27 +++ ..._multiblock_atomic_add_f32_f32_f32_avg.hpp | 27 +++ ...ance_multiblock_atomic_add_f32_f64_f32.hpp | 30 --- ..._multiblock_atomic_add_f32_f64_f32_add.hpp | 28 +++ ..._multiblock_atomic_add_f32_f64_f32_avg.hpp | 28 +++ ...ance_multiblock_atomic_add_f64_f64_f64.hpp | 30 --- ..._multiblock_atomic_add_f64_f64_f64_add.hpp | 27 +++ ..._multiblock_atomic_add_f64_f64_f64_avg.hpp | 27 +++ .../device_reduce_instance_threadwise.hpp | 77 +------ ...reduce_instance_threadwise_b16_f32_b16.hpp | 59 ------ ...ce_instance_threadwise_b16_f32_b16_add.hpp | 27 +++ ...e_instance_threadwise_b16_f32_b16_amax.hpp | 31 +++ ...ce_instance_threadwise_b16_f32_b16_avg.hpp | 27 +++ ...ce_instance_threadwise_b16_f32_b16_max.hpp | 31 +++ ...ce_instance_threadwise_b16_f32_b16_min.hpp | 31 +++ ..._instance_threadwise_b16_f32_b16_norm2.hpp | 27 +++ ...reduce_instance_threadwise_f16_f16_f16.hpp | 46 ---- ...e_instance_threadwise_f16_f16_f16_amax.hpp | 31 +++ ...ce_instance_threadwise_f16_f16_f16_max.hpp | 31 +++ ...ce_instance_threadwise_f16_f16_f16_min.hpp | 31 +++ ...reduce_instance_threadwise_f16_f32_f16.hpp | 34 --- ...ce_instance_threadwise_f16_f32_f16_add.hpp | 27 +++ ...ce_instance_threadwise_f16_f32_f16_avg.hpp | 27 +++ ..._instance_threadwise_f16_f32_f16_norm2.hpp | 27 +++ ...reduce_instance_threadwise_f32_f32_f32.hpp | 58 ------ ...ce_instance_threadwise_f32_f32_f32_add.hpp | 27 +++ ...e_instance_threadwise_f32_f32_f32_amax.hpp | 31 +++ ...ce_instance_threadwise_f32_f32_f32_avg.hpp | 27 +++ ...ce_instance_threadwise_f32_f32_f32_max.hpp | 31 +++ ...ce_instance_threadwise_f32_f32_f32_min.hpp | 31 +++ ..._instance_threadwise_f32_f32_f32_norm2.hpp | 27 +++ ...reduce_instance_threadwise_f32_f64_f32.hpp | 34 --- ...ce_instance_threadwise_f32_f64_f32_add.hpp | 27 +++ ...ce_instance_threadwise_f32_f64_f32_avg.hpp | 27 +++ ..._instance_threadwise_f32_f64_f32_norm2.hpp | 27 +++ ...reduce_instance_threadwise_f64_f64_f64.hpp | 58 ------ ...ce_instance_threadwise_f64_f64_f64_add.hpp | 27 +++ ...e_instance_threadwise_f64_f64_f64_amax.hpp | 31 +++ ...ce_instance_threadwise_f64_f64_f64_avg.hpp | 27 +++ ...ce_instance_threadwise_f64_f64_f64_max.hpp | 31 +++ ...ce_instance_threadwise_f64_f64_f64_min.hpp | 31 +++ ..._instance_threadwise_f64_f64_f64_norm2.hpp | 27 +++ ...e_reduce_instance_threadwise_i8_i32_i8.hpp | 30 --- ...duce_instance_threadwise_i8_i32_i8_add.hpp | 27 +++ ...duce_instance_threadwise_i8_i32_i8_avg.hpp | 27 +++ ...ce_reduce_instance_threadwise_i8_i8_i8.hpp | 46 ---- ...duce_instance_threadwise_i8_i8_i8_amax.hpp | 31 +++ ...educe_instance_threadwise_i8_i8_i8_max.hpp | 31 +++ ...educe_instance_threadwise_i8_i8_i8_min.hpp | 31 +++ .../ck/library/utility/host_reduction.hpp | 10 +- 
.../gpu/reduce/CMakeLists.txt | 95 +++++++-- ..._reduce_instance_blockwise_b16_f32_b16.cpp | 56 ----- ...uce_instance_blockwise_b16_f32_b16_add.cpp | 24 +++ ...ce_instance_blockwise_b16_f32_b16_amax.cpp | 28 +++ ...uce_instance_blockwise_b16_f32_b16_avg.cpp | 24 +++ ...uce_instance_blockwise_b16_f32_b16_max.cpp | 28 +++ ...uce_instance_blockwise_b16_f32_b16_min.cpp | 28 +++ ...e_instance_blockwise_b16_f32_b16_norm2.cpp | 24 +++ ..._reduce_instance_blockwise_f16_f16_f16.cpp | 43 ---- ...ce_instance_blockwise_f16_f16_f16_amax.cpp | 28 +++ ...uce_instance_blockwise_f16_f16_f16_max.cpp | 28 +++ ...uce_instance_blockwise_f16_f16_f16_min.cpp | 28 +++ ..._reduce_instance_blockwise_f16_f32_f16.cpp | 31 --- ...uce_instance_blockwise_f16_f32_f16_add.cpp | 24 +++ ...uce_instance_blockwise_f16_f32_f16_avg.cpp | 24 +++ ...e_instance_blockwise_f16_f32_f16_norm2.cpp | 24 +++ ..._reduce_instance_blockwise_f32_f32_f32.cpp | 55 ----- ...uce_instance_blockwise_f32_f32_f32_add.cpp | 24 +++ ...ce_instance_blockwise_f32_f32_f32_amax.cpp | 28 +++ ...uce_instance_blockwise_f32_f32_f32_avg.cpp | 24 +++ ...uce_instance_blockwise_f32_f32_f32_max.cpp | 28 +++ ...uce_instance_blockwise_f32_f32_f32_min.cpp | 28 +++ ...e_instance_blockwise_f32_f32_f32_norm2.cpp | 25 +++ ..._reduce_instance_blockwise_f32_f64_f32.cpp | 30 --- ...uce_instance_blockwise_f32_f64_f32_add.cpp | 23 ++ ...uce_instance_blockwise_f32_f64_f32_avg.cpp | 23 ++ ...e_instance_blockwise_f32_f64_f32_norm2.cpp | 23 ++ ..._reduce_instance_blockwise_f64_f64_f64.cpp | 55 ----- ...uce_instance_blockwise_f64_f64_f64_add.cpp | 24 +++ ...ce_instance_blockwise_f64_f64_f64_amax.cpp | 28 +++ ...uce_instance_blockwise_f64_f64_f64_avg.cpp | 24 +++ ...uce_instance_blockwise_f64_f64_f64_max.cpp | 28 +++ ...uce_instance_blockwise_f64_f64_f64_min.cpp | 28 +++ ...e_instance_blockwise_f64_f64_f64_norm2.cpp | 24 +++ ...ce_reduce_instance_blockwise_i8_i32_i8.cpp | 27 --- ...educe_instance_blockwise_i8_i32_i8_add.cpp | 24 +++ ...educe_instance_blockwise_i8_i32_i8_avg.cpp | 24 +++ ...ice_reduce_instance_blockwise_i8_i8_i8.cpp | 43 ---- ...educe_instance_blockwise_i8_i8_i8_amax.cpp | 28 +++ ...reduce_instance_blockwise_i8_i8_i8_max.cpp | 28 +++ ...reduce_instance_blockwise_i8_i8_i8_min.cpp | 28 +++ ...ance_multiblock_atomic_add_b16_f32_f32.cpp | 26 --- ..._multiblock_atomic_add_b16_f32_f32_add.cpp | 23 ++ ..._multiblock_atomic_add_b16_f32_f32_avg.cpp | 23 ++ ...ance_multiblock_atomic_add_f16_f32_f32.cpp | 27 --- ..._multiblock_atomic_add_f16_f32_f32_add.cpp | 24 +++ ..._multiblock_atomic_add_f16_f32_f32_avg.cpp | 24 +++ ...ance_multiblock_atomic_add_f32_f32_f32.cpp | 26 --- ..._multiblock_atomic_add_f32_f32_f32_add.cpp | 23 ++ ..._multiblock_atomic_add_f32_f32_f32_avg.cpp | 23 ++ ...ance_multiblock_atomic_add_f32_f64_f32.cpp | 26 --- ..._multiblock_atomic_add_f32_f64_f32_add.cpp | 23 ++ ..._multiblock_atomic_add_f32_f64_f32_avg.cpp | 23 ++ ...ance_multiblock_atomic_add_f64_f64_f64.cpp | 27 --- ..._multiblock_atomic_add_f64_f64_f64_add.cpp | 24 +++ ..._multiblock_atomic_add_f64_f64_f64_avg.cpp | 24 +++ ...reduce_instance_threadwise_b16_f32_b16.cpp | 56 ----- ...ce_instance_threadwise_b16_f32_b16_add.cpp | 24 +++ ...e_instance_threadwise_b16_f32_b16_amax.cpp | 28 +++ ...ce_instance_threadwise_b16_f32_b16_avg.cpp | 24 +++ ...ce_instance_threadwise_b16_f32_b16_max.cpp | 28 +++ ...ce_instance_threadwise_b16_f32_b16_min.cpp | 28 +++ ..._instance_threadwise_b16_f32_b16_norm2.cpp | 24 +++ ...reduce_instance_threadwise_f16_f16_f16.cpp | 43 ---- 
...e_instance_threadwise_f16_f16_f16_amax.cpp | 28 +++ ...ce_instance_threadwise_f16_f16_f16_max.cpp | 28 +++ ...ce_instance_threadwise_f16_f16_f16_min.cpp | 28 +++ ...reduce_instance_threadwise_f16_f32_f16.cpp | 30 --- ...ce_instance_threadwise_f16_f32_f16_add.cpp | 23 ++ ...ce_instance_threadwise_f16_f32_f16_avg.cpp | 23 ++ ..._instance_threadwise_f16_f32_f16_norm2.cpp | 23 ++ ...reduce_instance_threadwise_f32_f32_f32.cpp | 55 ----- ...ce_instance_threadwise_f32_f32_f32_add.cpp | 24 +++ ...e_instance_threadwise_f32_f32_f32_amax.cpp | 28 +++ ...ce_instance_threadwise_f32_f32_f32_avg.cpp | 24 +++ ...ce_instance_threadwise_f32_f32_f32_max.cpp | 28 +++ ...ce_instance_threadwise_f32_f32_f32_min.cpp | 28 +++ ..._instance_threadwise_f32_f32_f32_norm2.cpp | 24 +++ ...reduce_instance_threadwise_f32_f64_f32.cpp | 31 --- ...ce_instance_threadwise_f32_f64_f32_add.cpp | 24 +++ ...ce_instance_threadwise_f32_f64_f32_avg.cpp | 24 +++ ..._instance_threadwise_f32_f64_f32_norm2.cpp | 24 +++ ...reduce_instance_threadwise_f64_f64_f64.cpp | 54 ----- ...ce_instance_threadwise_f64_f64_f64_add.cpp | 23 ++ ...e_instance_threadwise_f64_f64_f64_amax.cpp | 27 +++ ...ce_instance_threadwise_f64_f64_f64_avg.cpp | 23 ++ ...ce_instance_threadwise_f64_f64_f64_max.cpp | 27 +++ ...ce_instance_threadwise_f64_f64_f64_min.cpp | 27 +++ ..._instance_threadwise_f64_f64_f64_norm2.cpp | 23 ++ ...e_reduce_instance_threadwise_i8_i32_i8.cpp | 28 --- ...duce_instance_threadwise_i8_i32_i8_add.cpp | 25 +++ ...duce_instance_threadwise_i8_i32_i8_avg.cpp | 24 +++ ...ce_reduce_instance_threadwise_i8_i8_i8.cpp | 43 ---- ...duce_instance_threadwise_i8_i8_i8_amax.cpp | 28 +++ ...educe_instance_threadwise_i8_i8_i8_max.cpp | 28 +++ ...educe_instance_threadwise_i8_i8_i8_min.cpp | 28 +++ profiler/include/profile_reduce_impl.hpp | 197 ++++++++++-------- 209 files changed, 4646 insertions(+), 2279 deletions(-) delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp create mode 100644 
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp create mode 100644 
library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp create mode 
100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp create mode 
100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp delete mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp create mode 100644 library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp delete mode 100644 
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.cpp create mode 100644 
library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp delete mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp create mode 100644 library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp diff --git a/example/12_reduce/reduce_blockwise.cpp b/example/12_reduce/reduce_blockwise.cpp index c1bcdbb826..fb9a6e6407 100644 --- a/example/12_reduce/reduce_blockwise.cpp +++ b/example/12_reduce/reduce_blockwise.cpp @@ -140,6 +140,10 @@ bool reduce_blockwise_test(bool do_verification, if(ShapeType::Rank_ != inLengths.size() || ShapeType::NumReduceDim_ != reduceDims.size()) return; + std::array arrReduceDims; + + std::copy(reduceDims.begin(), reduceDims.end(), arrReduceDims.begin()); + result = reduce_blockwise_impl( - do_verification, init_method, time_kernel, inLengths, reduceDims, alpha, beta); + do_verification, init_method, time_kernel, inLengths, arrReduceDims, alpha, beta); matched = true; }); diff --git a/example/12_reduce/reduce_blockwise_impl.hpp b/example/12_reduce/reduce_blockwise_impl.hpp index 1d2769ea9e..ad5537eb45 100644 --- a/example/12_reduce/reduce_blockwise_impl.hpp +++ b/example/12_reduce/reduce_blockwise_impl.hpp @@ -30,7 +30,7 @@ int reduce_blockwise_impl(bool do_verification, int init_method, bool time_kernel, const std::vector& inLengths, - const std::vector& reduceDims, + const std::array& reduceDims, float alpha, float beta) @@ -38,6 +38,8 @@ int reduce_blockwise_impl(bool do_verification, using namespace ck; using namespace ck::tensor_operation::device; + constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 
1 : Rank - NumReduceDim; + constexpr bool op_support_indices = (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX); @@ -143,7 +145,7 @@ int reduce_blockwise_impl(bool do_verification, std::vector outLengths; - std::vector invariantDims = get_invariant_dims(reduceDims); + auto invariantDims = get_invariant_dims(reduceDims); if(invariantDims.empty()) outLengths.push_back(1); @@ -256,22 +258,22 @@ int reduce_blockwise_impl(bool do_verification, acc_elementwise_op); }; - std::vector i_inLengths; - std::vector i_inStrides; - std::vector i_outLengths; - std::vector i_outStrides; + std::array arrInLengths; + std::array arrInStrides; + std::array arrOutLengths; + std::array arrOutStrides; - i_inLengths.assign(inLengths.begin(), inLengths.end()); - i_inStrides.assign(inStrides.begin(), inStrides.end()); - i_outLengths.assign(outLengths.begin(), outLengths.end()); - i_outStrides.assign(outStrides.begin(), outStrides.end()); + std::copy(inLengths.begin(), inLengths.end(), arrInLengths.begin()); + std::copy(inStrides.begin(), inStrides.end(), arrInStrides.begin()); + std::copy(outLengths.begin(), outLengths.end(), arrOutLengths.begin()); + std::copy(outStrides.begin(), outStrides.end(), arrOutStrides.begin()); auto reduce = DeviceReduceInstance{}; - auto argument_ptr = reduce.MakeArgumentPointer(i_inLengths, - i_inStrides, - i_outLengths, - i_outStrides, + auto argument_ptr = reduce.MakeArgumentPointer(arrInLengths, + arrInStrides, + arrOutLengths, + arrOutStrides, reduceDims, alpha, beta, diff --git a/example/12_reduce/reduce_blockwise_two_call.cpp b/example/12_reduce/reduce_blockwise_two_call.cpp index a84856c33f..a5c24b13a2 100644 --- a/example/12_reduce/reduce_blockwise_two_call.cpp +++ b/example/12_reduce/reduce_blockwise_two_call.cpp @@ -90,15 +90,15 @@ static bool time_kernel; int main(int argc, char* argv[]) { // used by the device reduction - const std::vector reduceDims_1 = {4}; - const std::vector invariantDims_1 = {0, 1, 2, 3}; + const std::array reduceDims_1 = {4}; + // const std::array invariantDims_1 = {0, 1, 2, 3}; - const std::vector reduceDims_2 = {3}; - const std::vector invariantDims_2 = {0, 1, 2}; + const std::array reduceDims_2 = {3}; + // const std::array invariantDims_2 = {0, 1, 2}; // used by the host reduction - const std::vector reduceDims = {3, 4}; - const std::vector invariantDims = {0, 1, 2}; + const std::array reduceDims = {3, 4}; + const std::array invariantDims = {0, 1, 2}; const std::vector inLengths_1 = {64, 320, 80, 4, 128}; @@ -214,26 +214,26 @@ int main(int argc, char* argv[]) acc_elementwise_op); }; - std::vector i_inLengths_1; - std::vector i_inStrides_1; - std::vector i_inLengths_2; - std::vector i_inStrides_2; - std::vector i_outLengths; - std::vector i_outStrides; + std::array arrInLengths_1; + std::array arrInStrides_1; + std::array arrInLengths_2; + std::array arrInStrides_2; + std::array arrOutLengths; + std::array arrOutStrides; - i_inLengths_1.assign(inLengths_1.begin(), inLengths_1.end()); - i_inStrides_1.assign(inStrides_1.begin(), inStrides_1.end()); - i_inLengths_2.assign(inLengths_2.begin(), inLengths_2.end()); - i_inStrides_2.assign(inStrides_2.begin(), inStrides_2.end()); - i_outLengths.assign(outLengths.begin(), outLengths.end()); - i_outStrides.assign(outStrides.begin(), outStrides.end()); + std::copy(inLengths_1.begin(), inLengths_1.end(), arrInLengths_1.begin()); + std::copy(inStrides_1.begin(), inStrides_1.end(), arrInStrides_1.begin()); + std::copy(inLengths_2.begin(), 
inLengths_2.end(), arrInLengths_2.begin()); + std::copy(inStrides_2.begin(), inStrides_2.end(), arrInStrides_2.begin()); + std::copy(outLengths.begin(), outLengths.end(), arrOutLengths.begin()); + std::copy(outStrides.begin(), outStrides.end(), arrOutStrides.begin()); auto reduce_1 = DeviceReduceInstance_1{}; - auto argument_ptr_1 = reduce_1.MakeArgumentPointer(i_inLengths_1, - i_inStrides_1, - i_inLengths_2, - i_inStrides_2, + auto argument_ptr_1 = reduce_1.MakeArgumentPointer(arrInLengths_1, + arrInStrides_1, + arrInLengths_2, + arrInStrides_2, reduceDims_1, 1.0f, 0.0f, @@ -255,10 +255,10 @@ int main(int argc, char* argv[]) auto reduce_2 = DeviceReduceInstance_2{}; - auto argument_ptr_2 = reduce_2.MakeArgumentPointer(i_inLengths_2, - i_inStrides_2, - i_outLengths, - i_outStrides, + auto argument_ptr_2 = reduce_2.MakeArgumentPointer(arrInLengths_2, + arrInStrides_2, + arrOutLengths, + arrOutStrides, reduceDims_2, alpha, beta, diff --git a/example/12_reduce/reduce_example_common.hpp b/example/12_reduce/reduce_example_common.hpp index 6334f608e3..05f0a0edb2 100644 --- a/example/12_reduce/reduce_example_common.hpp +++ b/example/12_reduce/reduce_example_common.hpp @@ -5,11 +5,10 @@ #include "ck/ck.hpp" -template -std::vector get_invariant_dims(const std::vector& reduceDims) +template +static inline std::array +get_invariant_dims(const std::array& reduceDims) { - assert(NumReduceDim == reduceDims.size()); - int reduceFlag = 0; // flag the bits for the reduceDims @@ -18,13 +17,15 @@ std::vector get_invariant_dims(const std::vector& reduceDims) reduceFlag |= 1 << reduceDims[i]; }; - std::vector invariantDims; + std::array invariantDims; // collect invariant dimensions + int dim = 0; for(int i = 0; i < Rank; i++) if((reduceFlag & (1 << i)) == 0) { - invariantDims.push_back(i); + invariantDims[dim] = i; + dim++; }; return invariantDims; diff --git a/example/12_reduce/reduce_multiblock_atomic_add.cpp b/example/12_reduce/reduce_multiblock_atomic_add.cpp index 9b56598ca3..90c04855b4 100644 --- a/example/12_reduce/reduce_multiblock_atomic_add.cpp +++ b/example/12_reduce/reduce_multiblock_atomic_add.cpp @@ -138,13 +138,17 @@ bool reduce_multiblock_atomic_add_test(bool do_verification, if(ShapeType::Rank_ != inLengths.size() || ShapeType::NumReduceDim_ != reduceDims.size()) return; + std::array a_reduceDims; + + std::copy(reduceDims.begin(), reduceDims.end(), a_reduceDims.begin()); + result = reduce_multiblock_atomic_add_impl( - do_verification, init_method, time_kernel, inLengths, reduceDims, alpha, beta); + do_verification, init_method, time_kernel, inLengths, a_reduceDims, alpha, beta); matched = true; }); diff --git a/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp b/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp index b678546730..0a5355f337 100644 --- a/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp +++ b/example/12_reduce/reduce_multiblock_atomic_add_impl.hpp @@ -29,7 +29,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification, int init_method, bool time_kernel, const std::vector& inLengths, - const std::vector& reduceDims, + const std::array& reduceDims, float alpha, float beta) @@ -37,6 +37,8 @@ int reduce_multiblock_atomic_add_impl(bool do_verification, using namespace ck; using namespace ck::tensor_operation::device; + constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 
1 : Rank - NumReduceDim; + constexpr bool op_support_atomic_add = (ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::AVG); @@ -84,7 +86,7 @@ int reduce_multiblock_atomic_add_impl(bool do_verification, std::vector outLengths; - std::vector invariantDims = get_invariant_dims(reduceDims); + auto invariantDims = get_invariant_dims(reduceDims); if(invariantDims.empty()) outLengths.push_back(1); @@ -169,22 +171,22 @@ int reduce_multiblock_atomic_add_impl(bool do_verification, acc_elementwise_op); }; - std::vector i_inLengths; - std::vector i_inStrides; - std::vector i_outLengths; - std::vector i_outStrides; + std::array arrInLengths; + std::array arrInStrides; + std::array arrOutLengths; + std::array arrOutStrides; - i_inLengths.assign(inLengths.begin(), inLengths.end()); - i_inStrides.assign(inStrides.begin(), inStrides.end()); - i_outLengths.assign(outLengths.begin(), outLengths.end()); - i_outStrides.assign(outStrides.begin(), outStrides.end()); + std::copy(inLengths.begin(), inLengths.end(), arrInLengths.begin()); + std::copy(inStrides.begin(), inStrides.end(), arrInStrides.begin()); + std::copy(outLengths.begin(), outLengths.end(), arrOutLengths.begin()); + std::copy(outStrides.begin(), outStrides.end(), arrOutStrides.begin()); auto reduce = DeviceReduceInstance{}; - auto argument_ptr = reduce.MakeArgumentPointer(i_inLengths, - i_inStrides, - i_outLengths, - i_outStrides, + auto argument_ptr = reduce.MakeArgumentPointer(arrInLengths, + arrInStrides, + arrOutLengths, + arrOutStrides, reduceDims, alpha, beta, diff --git a/include/ck/tensor_operation/gpu/device/device_reduce.hpp b/include/ck/tensor_operation/gpu/device/device_reduce.hpp index 468d0b5ab9..15aeb8e91c 100644 --- a/include/ck/tensor_operation/gpu/device/device_reduce.hpp +++ b/include/ck/tensor_operation/gpu/device/device_reduce.hpp @@ -3,27 +3,30 @@ #pragma once -#include +#include #include -#include -#include "ck/utility/common_header.hpp" -#include "ck/utility/reduction_enums.hpp" +#include "ck/ck.hpp" #include "ck/tensor_operation/gpu/device/device_base.hpp" namespace ck { namespace tensor_operation { namespace device { -template +template struct DeviceReduce : public BaseOperator { + static constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 
1 : Rank - NumReduceDim; + virtual std::unique_ptr - MakeArgumentPointer(const std::vector inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector outStrides, - const std::vector reduceDims, + MakeArgumentPointer(const std::array inLengths, + const std::array inStrides, + const std::array outLengths, + const std::array outStrides, + const std::array reduceDims, float alpha, float beta, const void* in_dev, @@ -36,9 +39,12 @@ struct DeviceReduce : public BaseOperator virtual std::unique_ptr MakeInvokerPointer() = 0; }; -template -using DeviceReducePtr = - std::unique_ptr>; +template +using DeviceReducePtr = std::unique_ptr< + DeviceReduce>; } // namespace device } // namespace tensor_operation diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp index da53841cc3..0ccac7c746 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp @@ -5,9 +5,8 @@ #include #include +#include -#include "ck/utility/common_header.hpp" -#include "ck/utility/reduction_operator.hpp" #include "ck/tensor_description/tensor_descriptor.hpp" #include "ck/tensor_description/tensor_descriptor_helper.hpp" #include "ck/tensor_operation/gpu/device/device_reduce.hpp" @@ -41,7 +40,8 @@ template -struct DeviceReduceMultiBlock : public DeviceReduce +struct DeviceReduceMultiBlock + : public DeviceReduce { static_assert(Rank <= 6, "Bigger Rank size is not supported!"); static_assert(BlockSize == MThreadClusterSize * KThreadClusterSize, @@ -58,8 +58,8 @@ struct DeviceReduceMultiBlock : public DeviceReduce& inLengths, - const std::vector& inStrides, + static auto MakeSrc2dDescriptor(const std::array& inLengths, + const std::array& inStrides, int blkGroupSize, int numBlockTileIteration) { - const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number{}); - const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number{}); + const auto tupleSrcLengths = + generate_tuple([&](auto I) { return inLengths[I]; }, Number{}); + const auto tupleSrcStrides = + generate_tuple([&](auto I) { return inStrides[I]; }, Number{}); const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); @@ -97,7 +99,7 @@ struct DeviceReduceMultiBlock : public DeviceReduce::type{}), + make_tuple(typename arithmetic_sequence_gen<0, NumSrcDim, 1>::type{}), make_tuple(Sequence<0>{})); return transform_tensor_descriptor(one_dim_inDesc, @@ -111,10 +113,10 @@ struct DeviceReduceMultiBlock : public DeviceReduce::type; using ReduceDims = typename arithmetic_sequence_gen::type; - const auto reduceDimLengths = - make_tuple_from_array_and_index_seq(inLengths, ReduceDims{}); + const auto reduceDimLengths = generate_tuple( + [&](auto I) { return inLengths[NumInvariantDim + I]; }, Number{}); const auto invariantDimLengths = - make_tuple_from_array_and_index_seq(inLengths, InvariantDims{}); + generate_tuple([&](auto I) { return inLengths[I]; }, Number{}); return transform_tensor_descriptor( inDesc, @@ -143,18 +145,20 @@ struct DeviceReduceMultiBlock : public DeviceReduce& outLengths, - const std::vector& outStrides) + static auto MakeDst1dDescriptor(const std::array& outLengths, + const std::array& outStrides) { - const auto tupleDstLengths = make_tuple_from_array(outLengths, Number{}); - const auto tupleDstStrides = make_tuple_from_array(outStrides, Number{}); + const auto tupleDstLengths = + 
generate_tuple([&](auto I) { return outLengths[I]; }, Number{}); + const auto tupleDstStrides = + generate_tuple([&](auto I) { return outStrides[I]; }, Number{}); auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); auto out_grid_desc_m = transform_tensor_descriptor( outDesc, make_tuple(make_merge_transform(tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}), + make_tuple(typename arithmetic_sequence_gen<0, NumDstDim, 1>::type{}), make_tuple(Sequence<0>{})); const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{}); @@ -170,18 +174,20 @@ struct DeviceReduceMultiBlock : public DeviceReduce& outLengths, - const std::vector& outStrides) + static auto MakeDst1dDescriptorForBufferSet(const std::array& outLengths, + const std::array& outStrides) { - const auto tupleDstLengths = make_tuple_from_array(outLengths, Number{}); - const auto tupleDstStrides = make_tuple_from_array(outStrides, Number{}); + const auto tupleDstLengths = + generate_tuple([&](auto I) { return outLengths[I]; }, Number{}); + const auto tupleDstStrides = + generate_tuple([&](auto I) { return outStrides[I]; }, Number{}); auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); auto out_grid_desc_m = transform_tensor_descriptor( outDesc, make_tuple(make_merge_transform(tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}), + make_tuple(typename arithmetic_sequence_gen<0, NumDstDim, 1>::type{}), make_tuple(Sequence<0>{})); const auto length = out_grid_desc_m.GetLength(Number<0>{}); @@ -198,11 +204,11 @@ struct DeviceReduceMultiBlock : public DeviceReduce inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector outStrides, - const std::vector reduceDims, + Argument(const std::array inLengths, + const std::array inStrides, + const std::array outLengths, + const std::array outStrides, + const std::array reduceDims, float alpha, float beta, const InDataType* in_dev, @@ -272,10 +278,10 @@ struct DeviceReduceMultiBlock : public DeviceReduce inLengths_; - std::vector inStrides_; - std::vector outLengths_; - std::vector outStrides_; + std::array inLengths_; + std::array inStrides_; + std::array outLengths_; + std::array outStrides_; AccDataType alpha_; AccDataType beta_; @@ -459,11 +465,11 @@ struct DeviceReduceMultiBlock : public DeviceReduce - MakeArgumentPointer(const std::vector inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector outStrides, - const std::vector reduceDims, + MakeArgumentPointer(const std::array inLengths, + const std::array inStrides, + const std::array outLengths, + const std::array outStrides, + const std::array reduceDims, float alpha, float beta, const void* in_dev, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp b/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp index f958a7e673..05e14f080e 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp @@ -5,6 +5,7 @@ #include #include +#include #include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/kernel_launch.hpp" @@ -34,7 +35,8 @@ template -struct DeviceReduceThreadWise : public DeviceReduce +struct DeviceReduceThreadWise + : public DeviceReduce { static_assert(Rank <= 6, "Bigger Rank size is not supported!"); @@ -49,18 +51,20 @@ struct DeviceReduceThreadWise : public 
DeviceReduce& inLengths, - const std::vector& inStrides) + static auto MakeSrc2dDescriptor(const std::array& inLengths, + const std::array& inStrides) { - const auto tupleSrcLengths = make_tuple_from_array(inLengths, Number{}); - const auto tupleSrcStrides = make_tuple_from_array(inStrides, Number{}); + const auto tupleSrcLengths = + generate_tuple([&](auto I) { return inLengths[I]; }, Number{}); + const auto tupleSrcStrides = + generate_tuple([&](auto I) { return inStrides[I]; }, Number{}); const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); @@ -70,7 +74,7 @@ struct DeviceReduceThreadWise : public DeviceReduce::type{}), + make_tuple(typename arithmetic_sequence_gen<0, NumSrcDim, 1>::type{}), make_tuple(Sequence<0>{})); return transform_tensor_descriptor(one_dim_inDesc, @@ -84,10 +88,10 @@ struct DeviceReduceThreadWise : public DeviceReduce::type; using ReduceDims = typename arithmetic_sequence_gen::type; - const auto reduceDimLengths = - make_tuple_from_array_and_index_seq(inLengths, ReduceDims{}); + const auto reduceDimLengths = generate_tuple( + [&](auto I) { return inLengths[NumInvariantDim + I]; }, Number{}); const auto invariantDimLengths = - make_tuple_from_array_and_index_seq(inLengths, InvariantDims{}); + generate_tuple([&](auto I) { return inLengths[I]; }, Number{}); return transform_tensor_descriptor( inDesc, @@ -116,18 +120,20 @@ struct DeviceReduceThreadWise : public DeviceReduce& outLengths, - const std::vector& outStrides) + static auto MakeDst1dDescriptor(const std::array& outLengths, + const std::array& outStrides) { - const auto tupleDstLengths = make_tuple_from_array(outLengths, Number{}); - const auto tupleDstStrides = make_tuple_from_array(outStrides, Number{}); + const auto tupleDstLengths = + generate_tuple([&](auto I) { return outLengths[I]; }, Number{}); + const auto tupleDstStrides = + generate_tuple([&](auto I) { return outStrides[I]; }, Number{}); auto outDesc = make_naive_tensor_descriptor(tupleDstLengths, tupleDstStrides); auto out_grid_desc_m = transform_tensor_descriptor( outDesc, make_tuple(make_merge_transform(tupleDstLengths)), - make_tuple(typename arithmetic_sequence_gen<0, numDstDim, 1>::type{}), + make_tuple(typename arithmetic_sequence_gen<0, NumDstDim, 1>::type{}), make_tuple(Sequence<0>{})); const auto invariantLength = out_grid_desc_m.GetLength(Number<0>{}); @@ -145,11 +151,11 @@ struct DeviceReduceThreadWise : public DeviceReduce inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector outStrides, - const std::vector reduceDims, + Argument(const std::array inLengths, + const std::array inStrides, + const std::array outLengths, + const std::array outStrides, + const std::array reduceDims, float alpha, float beta, const InDataType* in_dev, @@ -187,10 +193,10 @@ struct DeviceReduceThreadWise : public DeviceReduce inLengths_; - std::vector inStrides_; - std::vector outLengths_; - std::vector outStrides_; + std::array inLengths_; + std::array inStrides_; + std::array outLengths_; + std::array outStrides_; AccDataType alpha_; AccDataType beta_; @@ -321,11 +327,11 @@ struct DeviceReduceThreadWise : public DeviceReduce - MakeArgumentPointer(const std::vector inLengths, - const std::vector inStrides, - const std::vector outLengths, - const std::vector outStrides, - const std::vector reduceDims, + MakeArgumentPointer(const std::array inLengths, + const std::array inStrides, + const std::array outLengths, + const std::array outStrides, + const std::array reduceDims, float alpha, float 
beta, const void* in_dev, diff --git a/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp b/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp index 17f8d13d27..fba820578b 100644 --- a/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp +++ b/include/ck/tensor_operation/gpu/device/impl/device_softmax_impl.hpp @@ -8,12 +8,9 @@ #include "ck/utility/reduction_operator.hpp" #include "ck/tensor_operation/gpu/device/device_base.hpp" -#include "ck/tensor_operation/gpu/device/device_reduce.hpp" #include "ck/tensor_operation/gpu/device/device_softmax.hpp" #include "ck/tensor_operation/gpu/device/impl/device_reduce_common.hpp" -#include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp" #include "ck/tensor_operation/gpu/grid/gridwise_softmax.hpp" -#include "ck/tensor_operation/gpu/grid/gridwise_set_buffer_value.hpp" #include "ck/host_utility/device_prop.hpp" #include "ck/host_utility/kernel_launch.hpp" @@ -50,29 +47,80 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax; // OutDstVectorSize + static constexpr index_t NumInvariantDim = Rank - NumReduceDim; - using GridDesc_M_K = decltype(Reduction::MakeSrc2dDescriptor({1}, {1}, 1, 1)); + static constexpr index_t NumSrcDim = Rank; + static constexpr index_t NumDstDim = (NumInvariantDim == 0) ? 1 : NumInvariantDim; + static constexpr bool reduceAllDim = (NumInvariantDim == 0); + + static constexpr index_t M_BlockTileSize = MThreadClusterSize * MThreadSliceSize; + static constexpr index_t K_BlockTileSize = KThreadClusterSize * KThreadSliceSize; + + static auto MakeSrc2dDescriptor(const std::vector& inLengths, + const std::vector& inStrides, + int blkGroupSize, + int numBlockTileIteration) + { + const auto tupleSrcLengths = + generate_tuple([&](auto I) { return inLengths[I]; }, Number{}); + const auto tupleSrcStrides = + generate_tuple([&](auto I) { return inStrides[I]; }, Number{}); + + const auto inDesc = make_naive_tensor_descriptor(tupleSrcLengths, tupleSrcStrides); + + const auto in_grid_desc_m_k = [&]() { + if constexpr(reduceAllDim) + { + const auto one_dim_inDesc = transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(tupleSrcLengths)), + make_tuple(typename arithmetic_sequence_gen<0, NumSrcDim, 1>::type{}), + make_tuple(Sequence<0>{})); + + return transform_tensor_descriptor(one_dim_inDesc, + make_tuple(make_unmerge_transform(make_tuple( + 1, one_dim_inDesc.GetLength(Number<0>{})))), + make_tuple(Sequence<0>{}), + make_tuple(Sequence<0, 1>{})); + } + else + { + using InvariantDims = typename arithmetic_sequence_gen<0, NumInvariantDim, 1>::type; + using ReduceDims = typename arithmetic_sequence_gen::type; + + const auto reduceDimLengths = generate_tuple( + [&](auto I) { return inLengths[NumInvariantDim + I]; }, Number{}); + const auto invariantDimLengths = + generate_tuple([&](auto I) { return inLengths[I]; }, Number{}); + + return transform_tensor_descriptor( + inDesc, + make_tuple(make_merge_transform(invariantDimLengths), + make_merge_transform(reduceDimLengths)), + make_tuple(InvariantDims{}, ReduceDims{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + } + }(); + + const auto invariantLength = in_grid_desc_m_k.GetLength(Number<0>{}); + const auto reduceLength = in_grid_desc_m_k.GetLength(Number<1>{}); + + const int reduceSizePerBlock = K_BlockTileSize * numBlockTileIteration; + const auto inPad_M = + math::integer_least_multiple(invariantLength, M_BlockTileSize) - invariantLength; + const auto inPad_K = reduceSizePerBlock * blkGroupSize - reduceLength; + + 
auto in_grid_desc_m_k_padded = transform_tensor_descriptor( + in_grid_desc_m_k, + make_tuple(make_right_pad_transform(invariantLength, inPad_M), + make_right_pad_transform(reduceLength, inPad_K)), + make_tuple(Sequence<0>{}, Sequence<1>{}), + make_tuple(Sequence<0>{}, Sequence<1>{})); + + return (in_grid_desc_m_k_padded); + }; + + using GridDesc_M_K = decltype(MakeSrc2dDescriptor({1}, {1}, 1, 1)); using GridwiseSoftmaxGeneric = GridwiseSoftmax_mk_to_mk; - struct Argument : public Reduction::Argument + struct Argument : public BaseArgument { Argument(const std::vector inLengths, const std::vector inStrides, @@ -113,42 +161,60 @@ struct DeviceSoftmaxImpl : public DeviceSoftmax; diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp index 97e9addfb9..550a7b0345 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance.hpp @@ -3,24 +3,77 @@ #pragma once -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp" -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp" +#include 
"ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp" +#include 
"ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp" +#include 
"ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.hpp" diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp index fa76526c53..90cfe837df 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp @@ -5,6 +5,8 @@ #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" #include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp" + +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp" namespace ck { @@ -63,33 +65,20 @@ using reduce_configuration_2_instances_blockwise = std::tuple< >; #endif -template -using deviceReduceBlockWisePtrType = DeviceReducePtr< - typename reduce_unary_operator::InElementwiseOperation, - typename reduce_unary_operator::AccElementwiseOperation>; - template + bool OutputIndex> void add_device_reduce_instance_blockwise( - std::vector>& device_op_instances) + std::vector>& + device_op_instances) { - using ReduceOperation = typename reduce_binary_operator::opType; - using InElementwiseOperation = - typename reduce_unary_operator::InElementwiseOperation; - using AccElementwiseOperation = - typename reduce_unary_operator::AccElementwiseOperation; - - constexpr bool Indexable = - (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || - ReduceOpId == ReduceTensorOp::AMAX); - constexpr bool OutputIndex = Indexable && UseIndex; - static_for<0, std::tuple_size::value, 1>{}( [&](auto i) { using cfg1 = remove_cvref_t( \ - std::vector> & device_op_instances) - -#define ADD_BLOCKWISE_INST_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_BLOCKWISE_INST_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - 
static_cast(IndicesOpt), \ - Rank, \ - NumReduceDim) - -#define ADD_BLOCKWISE_INST_REF_BY_TYPE( \ - inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \ - extern template void add_device_reduce_instance_blockwise( \ - std::vector> & device_op_instances) - -#define ADD_BLOCKWISE_INST_REF_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_BLOCKWISE_INST_REF_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ - NumReduceDim) - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp deleted file mode 100644 index 8d1fed046a..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.hpp +++ /dev/null @@ -1,59 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1); - -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, 
bhalf_t, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp new file mode 100644 index 0000000000..521d93e600 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp new file mode 100644 index 0000000000..fe3fd6c0a7 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp new file mode 100644 index 0000000000..52a2b69cdd --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp new file mode 100644 index 0000000000..ee4fee41ea --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp new file mode 100644 index 0000000000..3abdb7f958 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp new file mode 100644 index 0000000000..b0dbcf31dd --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp deleted file mode 100644 index ae7f13ce97..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.hpp +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); 
-// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp new file mode 100644 index 0000000000..7bbf3df0a3 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp new file mode 100644 index 0000000000..559f322261 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp new file mode 100644 index 0000000000..28c9610789 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp deleted file mode 100644 index c26e136593..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.hpp +++ /dev/null @@ -1,34 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp new file mode 100644 index 0000000000..5080d28636 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp new file mode 100644 index 0000000000..0d24d15371 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp new file mode 100644 index 0000000000..c806e807c8 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp deleted file mode 100644 index 30064d588d..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.hpp +++ /dev/null @@ -1,58 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp new file mode 100644 index 0000000000..b7c046e751 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. 
All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp new file mode 100644 index 0000000000..771bec1c95 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp new file mode 100644 index 0000000000..c1fe8addba --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp new file mode 100644 index 0000000000..6bc0662fea --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp new file mode 100644 index 0000000000..6f8005132d --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp new file mode 100644 index 0000000000..c771ac4fab --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp deleted file mode 100644 index c9f6a1a5ff..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.hpp +++ /dev/null @@ -1,34 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp new file mode 100644 index 0000000000..b9ddbb9aea --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp new file mode 100644 index 0000000000..390a719ceb --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp new file mode 100644 index 0000000000..2a9ddbc61b --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp deleted file mode 100644 index c598e64cde..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.hpp +++ /dev/null @@ -1,58 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp new file mode 100644 index 0000000000..5746884442 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.hpp 
@@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp new file mode 100644 index 0000000000..ad0f2357e0 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp new file mode 100644 index 0000000000..c7d9527638 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp new file mode 100644 index 0000000000..ec56229937 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp new file mode 100644 index 0000000000..48f66da659 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp new file mode 100644 index 0000000000..fabfa5b4c6 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp deleted file mode 100644 index cd15949929..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.hpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp new file mode 100644 index 0000000000..e08faec200 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp new file mode 100644 index 0000000000..a1e692aae3 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp deleted file mode 100644 index bf62f92ad8..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.hpp +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1); -// 
clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp new file mode 100644 index 0000000000..e9654e8cce --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp new file mode 100644 index 0000000000..7824421309 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
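The headers removed here still describe each instance by a numeric ReduceOpId (0 = ADD, 2 = MIN, 3 = MAX, 4 = AMAX, 5 = AVG, 7 = NORM2) together with NanPropaOpt/IndicesOpt flags, while the replacement headers name the reduction and elementwise operations as types, using the aliases introduced into device_reduce_instance_impl_common.hpp further down. The correspondence below is inferred from the per-operation file suffixes and those aliases, so treat it as a reading aid rather than as text carried by the patch:

    // Assumed mapping from the old ID-based macro arguments to the new type-based declarations:
    //   ReduceOpId 0 (ADD)   -> ReduceAdd
    //   ReduceOpId 5 (AVG)   -> ReduceAdd, with UnaryDivide presumably as the AccElementwiseOp
    //   ReduceOpId 7 (NORM2) -> ReduceAdd, with UnarySquare (in) and UnarySqrt (acc) presumably attached
    //   ReduceOpId 2 (MIN)   -> ReduceMin
    //   ReduceOpId 3 (MAX)   -> ReduceMax
    //   ReduceOpId 4 (AMAX)  -> ReduceAMax (UnaryAbs is presumably its input elementwise op)
    //   IndicesOpt 0/1       -> the UseIndex = false/true pair, which is why the min/max/amax headers
    //                           carry eight extern template declarations and add/avg/norm2 only four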
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp new file mode 100644 index 0000000000..df323d40b3 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +extern template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp index 9fc409a08e..8c08e5ef2f 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp @@ -3,6 +3,9 @@ #pragma once +#include "ck/utility/reduction_operator.hpp" +#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp" + namespace ck { 
namespace tensor_operation { namespace device { @@ -32,6 +35,16 @@ struct ReductionConfiguration_2 static constexpr int KThreadSliceSize_ = KThreadSliceSize; }; +using ReduceAdd = ck::reduce::Add; +using ReduceMin = ck::reduce::Min; +using ReduceMax = ck::reduce::Max; +using ReduceAMax = ck::reduce::AMax; + +using UnarySquare = ck::tensor_operation::element_wise::UnarySquare; +using UnarySqrt = ck::tensor_operation::element_wise::UnarySqrt; +using UnaryDivide = ck::tensor_operation::element_wise::UnaryDivide; +using UnaryAbs = ck::tensor_operation::element_wise::UnaryAbs; + #define QUICK_REDUCE_TEST 1 } // namespace instance diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp index a4c17368f1..acf55d0683 100644 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp @@ -6,6 +6,7 @@ #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp" #include "ck/tensor_operation/gpu/device/impl/device_reduce_multiblock.hpp" +#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp" #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp" namespace ck { @@ -64,135 +65,58 @@ using reduce_configuration_2_instances_multiblock_atomic_add = std::tuple< >; #endif -template -using deviceReduceMultiBlockAtomicAddPtrType = DeviceReducePtr< - typename reduce_unary_operator::InElementwiseOperation, - typename reduce_unary_operator::AccElementwiseOperation>; - template + bool OutputIndex> void add_device_reduce_instance_multiblock_atomic_add( - std::vector>& device_op_instances) + std::vector>& + device_op_instances) { - using ReduceOperation = typename reduce_binary_operator::opType; - using InElementwiseOperation = - typename reduce_unary_operator::InElementwiseOperation; - using AccElementwiseOperation = - typename reduce_unary_operator::AccElementwiseOperation; + static_for<0, + std::tuple_size::value, + 1>{}([&](auto i) { + using cfg1 = remove_cvref_t(reduce_configuration_1_instances_multiblock_atomic_add{}))>; - constexpr bool Indexable = - (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || - ReduceOpId == ReduceTensorOp::AMAX); - constexpr bool OutputIndex = Indexable && UseIndex; - - static_assert(UseIndex == false, - "AtomicAdd can only be used with reduction operations using no index!"); - - constexpr bool op_acceptable = - (ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::MUL || - ReduceOpId == ReduceTensorOp::AVG || ReduceOpId == ReduceTensorOp::NORM1); - - constexpr bool out_type_acceptable = - (std::is_same::value || std::is_same::value); - - if constexpr(!op_acceptable || !out_type_acceptable) - return; - else - { static_for<0, - std::tuple_size::value, - 1>{}([&](auto i) { - using cfg1 = remove_cvref_t(reduce_configuration_1_instances_multiblock_atomic_add{}))>; + std::tuple_size::value, + 1>{}([&](auto j) { + using cfg2 = remove_cvref_t(reduce_configuration_2_instances_multiblock_atomic_add{}))>; - static_for< - 0, - std::tuple_size::value, - 1>{}([&](auto j) { - using cfg2 = remove_cvref_t(reduce_configuration_2_instances_multiblock_atomic_add{}))>; + using ReduceOpInstance = DeviceReduceMultiBlock; - using 
ReduceOpInstance = - DeviceReduceMultiBlock; - - device_op_instances.push_back( - std::make_unique(ReduceOpInstance{})); - }); + device_op_instances.push_back(std::make_unique(ReduceOpInstance{})); }); - } + }); }; -#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE( \ - inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \ - template void add_device_reduce_instance_multiblock_atomic_add( \ - std::vector> & device_op_instances) - -#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ - NumReduceDim) - -#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE( \ - inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \ - extern template void add_device_reduce_instance_multiblock_atomic_add( \ - std::vector> & device_op_instances) - -#define ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ - NumReduceDim) - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp deleted file mode 100644 index 3efc585068..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
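The rewritten add_device_reduce_instance_multiblock_atomic_add() above no longer derives its operations from a ReduceOpId through the reduce_binary_operator/reduce_unary_operator traits, and it drops the constexpr acceptability checks; it simply walks the two configuration tuples at compile time and registers one DeviceReduceMultiBlock instance per combination. The registration pattern itself — compile-time iteration over a tuple of configuration types, pushing one owning pointer per entry — can be reproduced in a few lines of standalone C++17 (illustrative types only, nothing below is CK code):

    #include <memory>
    #include <tuple>
    #include <vector>

    struct DeviceOpBase { virtual ~DeviceOpBase() = default; };

    template <int BlockSize, int KThreadSliceSize>
    struct DeviceOp : DeviceOpBase {};                    // stands in for DeviceReduceMultiBlock

    template <int BlockSize, int KThreadSliceSize>
    struct Config
    {
        static constexpr int block = BlockSize;
        static constexpr int slice = KThreadSliceSize;
    };

    using Configs = std::tuple<Config<256, 1>, Config<256, 2>, Config<128, 4>>;

    template <typename... Cfgs>
    void add_instances(std::vector<std::unique_ptr<DeviceOpBase>>& out, std::tuple<Cfgs...>)
    {
        // one push_back per configuration, expanded at compile time (CK uses static_for for this)
        (out.push_back(std::make_unique<DeviceOp<Cfgs::block, Cfgs::slice>>()), ...);
    }

    int main()
    {
        std::vector<std::unique_ptr<DeviceOpBase>> instances;
        add_instances(instances, Configs{});   // instances.size() == 3
    }

In the patch the same walk is done with static_for over reduce_configuration_1_instances_multiblock_atomic_add and reduce_configuration_2_instances_multiblock_atomic_add, and combinations that the atomic-add kernel cannot support are excluded simply by never declaring them in any instance header.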
- -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(bhalf_t, float, float, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp new file mode 100644 index 0000000000..f5102f4977 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp new file mode 100644 index 0000000000..ec513113d9 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp deleted file mode 100644 index 804cba12cc..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(half_t, float, float, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp new file mode 100644 index 0000000000..3a3d53b8c6 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp new file mode 100644 index 0000000000..bbf4398964 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp deleted file mode 100644 index 32eb843a1c..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp new file mode 100644 index 0000000000..55147a60e5 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp new file mode 100644 index 0000000000..4bff06c6af --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
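Note that the guards formerly embedded in the add function (output type restricted to float/double, UseIndex forbidden, the operation limited to ADD/MUL/AVG/NORM1) are gone; the atomic-add variant is now only ever declared through the _add and _avg instance headers with f32 or f64 outputs, such as the ones above and below. If an explicit compile-time guard were still wanted, it would amount to roughly this sketch (a hypothetical helper, not part of the patch):

    #include <type_traits>

    // Hypothetical guard equivalent to the checks removed from
    // add_device_reduce_instance_multiblock_atomic_add(); parameter names are illustrative.
    template <typename OutDataType, bool UseIndex>
    struct AtomicAddReduceGuard
    {
        static_assert(std::is_same_v<OutDataType, float> || std::is_same_v<OutDataType, double>,
                      "atomic-add reductions are only instantiated for f32/f64 outputs");
        static_assert(!UseIndex, "atomic-add cannot be combined with index output");
    };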
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp deleted file mode 100644 index 9f2a892475..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp new file mode 100644 index 0000000000..daffa1aa4d --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.hpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp new file mode 100644 index 0000000000..52c4171123 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.hpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp deleted file mode 100644 index bd20069992..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.hpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp new file mode 100644 index 0000000000..2f358b06e0 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp new file mode 100644 index 0000000000..84c99dcc57 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/reduction_enums.hpp"
+
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
+extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&);
+extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&);
+extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&);
+extern template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&);
+// clang-format on
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
index e09fd688d2..dfcc8dd854 100644
--- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp
@@ -5,6 +5,8 @@
 #include "ck/tensor_operation/gpu/device/reduction_operator_mapping.hpp"
 #include "ck/tensor_operation/gpu/device/impl/device_reduce_threadwise.hpp"
+
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
 #include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_impl_common.hpp"
 namespace ck {
@@ -49,33 +51,20 @@ using reduce_configuration_2_instances_threadwise = std::tuple<
     >;
 #endif
-template 
-using deviceReduceThreadWisePtrType = DeviceReducePtr<
-    typename reduce_unary_operator::InElementwiseOperation,
-    typename reduce_unary_operator::AccElementwiseOperation>;
-
 template 
+          bool OutputIndex>
 void add_device_reduce_instance_threadwise(
-    std::vector>& device_op_instances)
+    std::vector>&
+        device_op_instances)
 {
-    using ReduceOperation = typename reduce_binary_operator::opType;
-    using InElementwiseOperation =
-        typename reduce_unary_operator::InElementwiseOperation;
-    using AccElementwiseOperation =
-        typename reduce_unary_operator::AccElementwiseOperation;
-
-    constexpr bool Indexable =
-        (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX ||
-         ReduceOpId == ReduceTensorOp::AMAX);
-    constexpr bool OutputIndex = Indexable && UseIndex;
-
     using cfg1 = ReductionConfiguration_1<256, 256, 1>;
     static_for<0, std::tuple_size::value, 1>{}(
@@ -89,8 +78,8 @@ void add_device_reduce_instance_threadwise(
                                        Rank,
                                        NumReduceDim,
                                        ReduceOperation,
-                                       InElementwiseOperation,
-                                       AccElementwiseOperation,
+                                       InElementwiseOp,
+                                       AccElementwiseOp,
                                        PropagateNan,
                                        OutputIndex,
                                        false, // HaveIndexInputIfOutputIndex
@@ -105,52 +94,6 @@ void add_device_reduce_instance_threadwise(
         });
 };
-#define ADD_THREADWISE_INST_BY_TYPE(                                              \
-    inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim)     \
-    template void add_device_reduce_instance_threadwise(                          \
-        std::vector> & device_op_instances)
-
-#define ADD_THREADWISE_INST_BY_ID(                                                \
-    inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim)         \
-    ADD_THREADWISE_INST_BY_TYPE(inT,                                              \
-                                compT,                                            \
-                                outT,                                             \
-                                static_cast(ReduceOpId),                          \
-                                static_cast(NanOpt),                              \
-                                static_cast(IndicesOpt),                          \
-
Rank, \ - NumReduceDim) - -#define ADD_THREADWISE_INST_REF_BY_TYPE( \ - inT, compT, outT, ReduceOpId, PropagateNan, UseIndex, Rank, NumReduceDim) \ - extern template void add_device_reduce_instance_threadwise( \ - std::vector> & device_op_instances) - -#define ADD_THREADWISE_INST_REF_BY_ID( \ - inT, compT, outT, ReduceOpId, NanOpt, IndicesOpt, Rank, NumReduceDim) \ - ADD_THREADWISE_INST_REF_BY_TYPE(inT, \ - compT, \ - outT, \ - static_cast(ReduceOpId), \ - static_cast(NanOpt), \ - static_cast(IndicesOpt), \ - Rank, \ - NumReduceDim) - } // namespace instance } // namespace device } // namespace tensor_operation diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp deleted file mode 100644 index 5f7f5c7af5..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.hpp +++ /dev/null @@ -1,59 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1); - -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, 
bhalf_t, 2, 0, 1, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp new file mode 100644 index 0000000000..4168508b28 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.hpp new file mode 100644 index 0000000000..317006e3a5 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp new file mode 100644 index 0000000000..fc7718ddc0 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.hpp new file mode 100644 index 0000000000..e6616386ca --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.hpp new file mode 100644 index 0000000000..a9441b8e8e --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp new file mode 100644 index 0000000000..6820ace8cf --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp deleted file mode 100644 index 3c21b408cc..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.hpp +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1); 
-ADD_THREADWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp new file mode 100644 index 0000000000..ab3d4e6e2c --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp new file mode 100644 index 0000000000..ee08c9635b --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp new file mode 100644 index 0000000000..1007ca27bb --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp deleted file mode 100644 index cd116986d9..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.hpp +++ /dev/null @@ -1,34 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD -ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG -ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp new file mode 100644 index 0000000000..1d562c4999 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp new file mode 100644 index 0000000000..5aac638b1e --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp new file mode 100644 index 0000000000..7a3c764097 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp deleted file mode 100644 index a764735fa9..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.hpp +++ /dev/null @@ -1,58 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp new file mode 100644 index 0000000000..4685d7b5d5 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 
2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.hpp new file mode 100644 index 0000000000..1de338fb48 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp new file mode 100644 index 0000000000..e86c41a949 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.hpp new file mode 100644 index 0000000000..2ca9008560 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.hpp new file mode 100644 index 0000000000..38380e71ec --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp new file mode 100644 index 0000000000..04c5f3e658 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp deleted file mode 100644 index 7d47c79f84..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.hpp +++ /dev/null @@ -1,34 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD -ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG -ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp new file mode 100644 index 0000000000..fef5d40884 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp new file mode 100644 index 0000000000..2416f614c3 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp new file mode 100644 index 0000000000..fbd0285ae8 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp deleted file mode 100644 index faced808a2..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.hpp +++ /dev/null @@ -1,58 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2 -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp new file mode 100644 index 0000000000..103b85a011 --- /dev/null +++ 
b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp new file mode 100644 index 0000000000..e01f590f0e --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp new file mode 100644 index 0000000000..14a7459bb8 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp new file mode 100644 index 0000000000..7dfd806012 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp new file mode 100644 index 0000000000..7670a27c84 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp new file mode 100644 index 0000000000..8bb85f3779 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp deleted file mode 100644 index 111ba7a0cf..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.hpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp new file mode 100644 index 0000000000..a005ba8d42 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp new file mode 100644 index 0000000000..9e8c07eb4f --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.hpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
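
The deleted blocks above select the reduction through a numeric ReduceOpId (plus NanPropaOpt and IndicesOpt flags), while the replacement headers name the operation and element-wise ops as template arguments and carry the NaN/index options as the PropagateNan and UseIndex booleans. For reference, the ID values implied by the "// for ..." comments in the removed code are summarised below; the enum name is only an illustrative stand-in, not the library's reduction-enum header.

// Legend for the ReduceOpId column used by the removed *_BY_ID macros,
// reconstructed from the "// for ADD/AVG/..." comments above.
enum class ReduceOpId : int
{
    ADD   = 0,
    MIN   = 2,
    MAX   = 3,
    AMAX  = 4,
    AVG   = 5,
    NORM2 = 7,
};
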
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp deleted file mode 100644 index c771f057d6..0000000000 --- a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.hpp +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#pragma once - -#include "ck/utility/data_type.hpp" - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1); -ADD_THREADWISE_INST_REF_BY_ID(int8_t, int8_t, 
int8_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp new file mode 100644 index 0000000000..a69f88f5a9 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp new file mode 100644 index 0000000000..734b31c1e9 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp new file mode 100644 index 0000000000..237bd96966 --- /dev/null +++ b/library/include/ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.hpp @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#pragma once + +#include "ck/utility/data_type.hpp" +#include "ck/utility/reduction_enums.hpp" + +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +extern template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/include/ck/library/utility/host_reduction.hpp b/library/include/ck/library/utility/host_reduction.hpp index f02ebcd79a..7c0c969ac5 100644 --- a/library/include/ck/library/utility/host_reduction.hpp +++ b/library/include/ck/library/utility/host_reduction.hpp @@ -96,10 +96,9 @@ struct ReductionHost static constexpr int NumInvariantDim = Rank - NumReduceDim; std::vector outStrides; - std::vector invariantDims; - std::vector reduceDims; IndexDataType divider; + std::array reduceLengths; std::array reduceStrides; std::array invariantLengths; @@ -110,15 +109,12 @@ struct ReductionHost 
ReductionHost(HostTensorDescriptor& inDesc, HostTensorDescriptor& outDesc, - const std::vector& invariantDims_, - const std::vector& reduceDims_) + const std::array invariantDims, + const std::array reduceDims) { // this->outLengths = to_int_vector(outDesc.GetLengths()); this->outStrides = outDesc.GetStrides(); - this->invariantDims = invariantDims_; - this->reduceDims = reduceDims_; - int product = 1; for(int i = 0; i < NumReduceDim; i++) diff --git a/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt index 4eddd6b644..31ae7226f4 100644 --- a/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/reduce/CMakeLists.txt @@ -1,23 +1,76 @@ add_instance_library(device_reduce_instance - device_reduce_instance_blockwise_f16_f16_f16.cpp - device_reduce_instance_blockwise_f16_f32_f16.cpp - device_reduce_instance_blockwise_f32_f32_f32.cpp - device_reduce_instance_blockwise_f32_f64_f32.cpp - device_reduce_instance_blockwise_f64_f64_f64.cpp - device_reduce_instance_blockwise_i8_i32_i8.cpp - device_reduce_instance_blockwise_i8_i8_i8.cpp - device_reduce_instance_blockwise_b16_f32_b16.cpp - device_reduce_instance_threadwise_f16_f16_f16.cpp - device_reduce_instance_threadwise_f16_f32_f16.cpp - device_reduce_instance_threadwise_f32_f32_f32.cpp - device_reduce_instance_threadwise_f32_f64_f32.cpp - device_reduce_instance_threadwise_f64_f64_f64.cpp - device_reduce_instance_threadwise_i8_i32_i8.cpp - device_reduce_instance_threadwise_i8_i8_i8.cpp - device_reduce_instance_threadwise_b16_f32_b16.cpp - device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp - device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp - device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp - device_reduce_instance_multiblock_atomic_add_f64_f64_f64.cpp - device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp + device_reduce_instance_blockwise_f16_f16_f16_min.cpp + device_reduce_instance_blockwise_f16_f16_f16_max.cpp + device_reduce_instance_blockwise_f16_f16_f16_amax.cpp + device_reduce_instance_blockwise_f16_f32_f16_add.cpp + device_reduce_instance_blockwise_f16_f32_f16_avg.cpp + device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp + device_reduce_instance_blockwise_f32_f32_f32_add.cpp + device_reduce_instance_blockwise_f32_f32_f32_avg.cpp + device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp + device_reduce_instance_blockwise_f32_f32_f32_min.cpp + device_reduce_instance_blockwise_f32_f32_f32_max.cpp + device_reduce_instance_blockwise_f32_f32_f32_amax.cpp + device_reduce_instance_blockwise_f32_f64_f32_add.cpp + device_reduce_instance_blockwise_f32_f64_f32_avg.cpp + device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp + device_reduce_instance_blockwise_f64_f64_f64_add.cpp + device_reduce_instance_blockwise_f64_f64_f64_avg.cpp + device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp + device_reduce_instance_blockwise_f64_f64_f64_min.cpp + device_reduce_instance_blockwise_f64_f64_f64_max.cpp + device_reduce_instance_blockwise_f64_f64_f64_amax.cpp + device_reduce_instance_blockwise_i8_i32_i8_add.cpp + device_reduce_instance_blockwise_i8_i32_i8_avg.cpp + device_reduce_instance_blockwise_i8_i8_i8_min.cpp + device_reduce_instance_blockwise_i8_i8_i8_max.cpp + device_reduce_instance_blockwise_i8_i8_i8_amax.cpp + device_reduce_instance_blockwise_b16_f32_b16_add.cpp + device_reduce_instance_blockwise_b16_f32_b16_avg.cpp + device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp + 
device_reduce_instance_blockwise_b16_f32_b16_min.cpp + device_reduce_instance_blockwise_b16_f32_b16_max.cpp + device_reduce_instance_blockwise_b16_f32_b16_amax.cpp + device_reduce_instance_threadwise_f16_f16_f16_min.cpp + device_reduce_instance_threadwise_f16_f16_f16_max.cpp + device_reduce_instance_threadwise_f16_f16_f16_amax.cpp + device_reduce_instance_threadwise_f16_f32_f16_add.cpp + device_reduce_instance_threadwise_f16_f32_f16_avg.cpp + device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp + device_reduce_instance_threadwise_f32_f32_f32_add.cpp + device_reduce_instance_threadwise_f32_f32_f32_avg.cpp + device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp + device_reduce_instance_threadwise_f32_f32_f32_min.cpp + device_reduce_instance_threadwise_f32_f32_f32_max.cpp + device_reduce_instance_threadwise_f32_f32_f32_amax.cpp + device_reduce_instance_threadwise_f32_f64_f32_add.cpp + device_reduce_instance_threadwise_f32_f64_f32_avg.cpp + device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp + device_reduce_instance_threadwise_f64_f64_f64_add.cpp + device_reduce_instance_threadwise_f64_f64_f64_avg.cpp + device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp + device_reduce_instance_threadwise_f64_f64_f64_min.cpp + device_reduce_instance_threadwise_f64_f64_f64_max.cpp + device_reduce_instance_threadwise_f64_f64_f64_amax.cpp + device_reduce_instance_threadwise_i8_i32_i8_add.cpp + device_reduce_instance_threadwise_i8_i32_i8_avg.cpp + device_reduce_instance_threadwise_i8_i8_i8_min.cpp + device_reduce_instance_threadwise_i8_i8_i8_max.cpp + device_reduce_instance_threadwise_i8_i8_i8_amax.cpp + device_reduce_instance_threadwise_b16_f32_b16_add.cpp + device_reduce_instance_threadwise_b16_f32_b16_avg.cpp + device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp + device_reduce_instance_threadwise_b16_f32_b16_min.cpp + device_reduce_instance_threadwise_b16_f32_b16_max.cpp + device_reduce_instance_threadwise_b16_f32_b16_amax.cpp + device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp + device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp + device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp + device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp + device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp + device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp + device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp + device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp + device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp + device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp ) diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.cpp deleted file mode 100644 index c97efbc901..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16.cpp +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
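
Each source file listed in the CMake block above holds the explicit-instantiation definitions for a single (data type, reduce operation) combination, so the instances build in parallel and each specialization is compiled exactly once. A minimal sketch of the .cpp side of that split, reusing the simplified stand-in types from the header sketch earlier (again an illustration under assumed names, not the library's actual code):

#include <memory>
#include <vector>

struct DeviceOpBase { virtual ~DeviceOpBase() = default; };  // as in the header sketch above
struct ReduceAdd {};
struct PassThrough {};

template <typename InDataType, typename AccDataType, typename OutDataType,
          int Rank, int NumReduceDim, typename ReduceOperation,
          typename InElementwiseOp, typename AccElementwiseOp,
          bool PropagateNan, bool UseIndex>
void add_device_reduce_instance_blockwise(std::vector<std::unique_ptr<DeviceOpBase>>& instances)
{
    // The real definition appends one DeviceReduce instance per kernel configuration;
    // the placeholder below only marks where that happens.
    instances.push_back(std::make_unique<DeviceOpBase>());
}

// Explicit-instantiation definitions: this translation unit is the only one that
// actually generates code for these specializations.
template void add_device_reduce_instance_blockwise<
    float, float, float, 4, 3, ReduceAdd, PassThrough, PassThrough, false, false>(
    std::vector<std::unique_ptr<DeviceOpBase>>&);
template void add_device_reduce_instance_blockwise<
    float, float, float, 2, 1, ReduceAdd, PassThrough, PassThrough, false, false>(
    std::vector<std::unique_ptr<DeviceOpBase>>&);

The rank / reduce-dim combinations instantiated per file stay the same as in the removed macros: (4,3), (4,4), (4,1) and (2,1).
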
- -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1); - -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp new file mode 100644 index 0000000000..1909183a55 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_add.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
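
The host_reduction.hpp hunk further up makes the same vector-to-array move for the host reference implementation: since Rank and NumReduceDim are compile-time template parameters, the invariant and reduce dimension lists can be passed as fixed-size std::array arguments instead of being stored as std::vector members. A rough sketch of the resulting constructor shape, with the element types and member names assumed for illustration rather than copied from the library:

#include <array>
#include <cstddef>

template <int Rank, int NumReduceDim>
struct ReductionHostSketch
{
    static constexpr int NumInvariantDim = Rank - NumReduceDim;

    std::array<std::size_t, NumReduceDim> reduceLengths{};    // assumed element types
    std::array<std::size_t, NumReduceDim> reduceStrides{};
    std::array<std::size_t, NumInvariantDim> invariantLengths{};

    // The dimension lists arrive as fixed-size arrays, sized against Rank and
    // NumReduceDim at compile time, and are consumed here rather than kept alive
    // as std::vector members on the object.
    ReductionHostSketch(const std::array<int, NumInvariantDim> invariantDims,
                        const std::array<int, NumReduceDim> reduceDims)
    {
        (void)invariantDims;
        (void)reduceDims;  // the real constructor derives the length/stride arrays from these
    }
};

// Example: a rank-4 tensor reduced over dimensions 1, 2 and 3.
const ReductionHostSketch<4, 3> host_sketch({0}, {1, 2, 3});
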
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp new file mode 100644 index 0000000000..ec30201021 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_amax.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp new file mode 100644 index 0000000000..89f3e58280 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp new file mode 100644 index 0000000000..f1bdd1927b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_max.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp new file mode 100644 index 0000000000..58e9c56229 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_min.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp new file mode 100644 index 0000000000..e5012c651a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_b16_f32_b16_norm2.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.cpp deleted file mode 100644 index 5e73b3d8b9..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.cpp new file mode 100644 index 0000000000..0970cb9d7c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_amax.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp new file mode 100644 index 0000000000..6ee179a511 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_max.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp new file mode 100644 index 0000000000..e53b403065 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f16_f16_min.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.cpp deleted file mode 100644 index 93d3e27016..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp new file mode 100644 index 0000000000..cab5738fba --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_add.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp new file mode 100644 index 0000000000..7d2a4fad2a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp new file mode 100644 index 0000000000..e08b64f8b3 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f16_f32_f16_norm2.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.cpp deleted file mode 100644 index 38800ddde5..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX 
-ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp new file mode 100644 index 0000000000..89cabf3762 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_add.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp new file mode 100644 index 0000000000..1e602c121d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_amax.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp new file mode 100644 index 0000000000..489b4bc452 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp new file mode 100644 index 0000000000..04e2c5b164 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_max.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp new file mode 100644 index 0000000000..5c0e536048 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_min.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp new file mode 100644 index 0000000000..899dfcd37c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f32_f32_norm2.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.cpp deleted file mode 100644 index b821aeee0a..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(float, double, float, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(float, double, float, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(float, double, float, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp new file mode 100644 index 0000000000..5624337a47 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_add.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp new file mode 100644 index 0000000000..2f3067ce29 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_avg.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp new file mode 100644 index 0000000000..2648e7d59d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f32_f64_f32_norm2.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.cpp deleted file mode 100644 index 074d0cfdf7..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2 -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1); 
-ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp new file mode 100644 index 0000000000..f67ae2ee7c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_add.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.cpp new file mode 100644 index 0000000000..6f8e07851d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_amax.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp new file mode 100644 index 0000000000..69fecf72f5 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp new file mode 100644 index 0000000000..129a4f0f0e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_max.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp new file mode 100644 index 0000000000..21babc4aa6 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_min.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp new file mode 100644 index 0000000000..b85b3e2b68 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_f64_f64_f64_norm2.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.cpp deleted file mode 100644 index e803fb842d..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD -ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG -ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp new file mode 100644 index 0000000000..24a8293b5d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_add.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp new file mode 100644 index 0000000000..73e60fa959 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i32_i8_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.cpp deleted file mode 100644 index 4bf4139d28..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1); -ADD_BLOCKWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp new file mode 100644 index 0000000000..72e649d897 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_amax.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp new file mode 100644 index 0000000000..a7e053a065 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_max.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp new file mode 100644 index 0000000000..0e3abd35b4 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise_i8_i8_i8_min.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_blockwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +template void add_device_reduce_instance_blockwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp deleted file mode 100644 index a571655cdc..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32.cpp +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(bhalf_t, float, float, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp new file mode 100644 index 0000000000..4b32456074 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_add.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp new file mode 100644 index 0000000000..3298587a42 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_b16_f32_f32_avg.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp deleted file mode 100644 index 9ad9a630bd..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(half_t, float, float, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp new file mode 100644 index 0000000000..729d4fd6e1 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_add.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp new file mode 100644 index 0000000000..e3e36e312b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f16_f32_f32_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp deleted file mode 100644 index 4ee70702c0..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32.cpp +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp new file mode 100644 index 0000000000..e7580e7d7d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_add.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp new file mode 100644 index 0000000000..1e6feb0071 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f32_f32_avg.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp deleted file mode 100644 index 8c5fa80e81..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32.cpp +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(float, double, float, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp new file mode 100644 index 0000000000..669c4d34ca --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_add.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp new file mode 100644 index 0000000000..335a5474ce --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f32_f64_f32_avg.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.cpp deleted file mode 100644 index d2b81c486d..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64.cpp +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 0, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 0, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 5, 0, 0, 4, 4); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 5, 0, 0, 4, 1); -ADD_MULTIBLOCK_ATOMIC_ADD_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp new file mode 100644 index 0000000000..e95e8391a2 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_add.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp new file mode 100644 index 0000000000..25498158a2 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add_f64_f64_f64_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_multiblock_atomic_add.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +template void add_device_reduce_instance_multiblock_atomic_add(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.cpp deleted file mode 100644 index 8d678e784a..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16.cpp +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
-
-#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
-
-namespace ck {
-namespace tensor_operation {
-namespace device {
-namespace instance {
-
-// clang-format off
-// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4);
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1);
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1);
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4);
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1);
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1);
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4);
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 1);
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1);
-
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4);
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1);
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1);
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4);
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1);
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1);
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4);
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1);
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1);
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4);
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 1);
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1);
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4);
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1);
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1);
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4);
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1);
-ADD_THREADWISE_INST_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
-// clang-format on
-
-} // namespace instance
-} // namespace device
-} // namespace tensor_operation
-
-} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.cpp
new file mode 100644
index 0000000000..7262b8a5ba
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_add.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+// clang-format on
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.cpp
new file mode 100644
index 0000000000..c526a74f1a
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_amax.cpp
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+// clang-format on
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.cpp
new file mode 100644
index 0000000000..4c7252e742
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_avg.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+// clang-format on
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.cpp
new file mode 100644
index 0000000000..618900a7d7
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_max.cpp
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+// clang-format on
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.cpp
new file mode 100644
index 0000000000..ce747cbc76
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_min.cpp
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+// clang-format on
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp
new file mode 100644
index 0000000000..06f622b9e6
--- /dev/null
+++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_b16_f32_b16_norm2.cpp
@@ -0,0 +1,24 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#include "ck/utility/reduction_enums.hpp"
+#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+// clang-format off
+// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+template void add_device_reduce_instance_threadwise(std::vector>&);
+// clang-format on
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+
+} // namespace ck
diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.cpp
deleted file mode 100644
index 010560586a..0000000000
--- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-// SPDX-License-Identifier: MIT
-// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
- -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1); -ADD_THREADWISE_INST_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.cpp new file mode 100644 index 0000000000..708eb58d40 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_amax.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.cpp new file mode 100644 index 0000000000..c8a62fa149 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_max.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.cpp new file mode 100644 index 0000000000..ce2092153c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f16_f16_min.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.cpp deleted file mode 100644 index 55c53dfd58..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD -ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG -ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2 -ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.cpp new file mode 100644 index 0000000000..29251a8b9a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_add.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.cpp new file mode 100644 index 0000000000..734fa9fd3e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_avg.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp new file mode 100644 index 0000000000..d7a0e2bfe8 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f16_f32_f16_norm2.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.cpp deleted file mode 100644 index 367cf9a65d..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32.cpp +++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD -ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 0, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG -ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 5, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 7, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN -ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX -ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX -ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN -ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 4); -ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 4, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 2, 0, 1, 2, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 
3, 0, 1, 4, 3); // for MAX -ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 4); -ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 4, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 3, 0, 1, 2, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX -ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 4); -ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 4, 1); -ADD_THREADWISE_INST_BY_ID(float, float, float, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp new file mode 100644 index 0000000000..8b97f3008b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_add.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp new file mode 100644 index 0000000000..53d01e38d6 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_amax.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp new file mode 100644 index 0000000000..125d054f3d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp new file mode 100644 index 0000000000..fb86a2bbe4 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_max.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp new file mode 100644 index 0000000000..49af08390a --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_min.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp new file mode 100644 index 0000000000..30cc1b13ec --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f32_f32_norm2.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.cpp deleted file mode 100644 index 18fd08448c..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32.cpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD -ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(float, double, float, 0, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG -ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(float, double, float, 5, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2 -ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(float, double, float, 7, 0, 0, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp new file mode 100644 index 0000000000..24f8a9ba5c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_add.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp new file mode 100644 index 0000000000..a26702f053 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp new file mode 100644 index 0000000000..34fe32628f --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f32_f64_f32_norm2.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.cpp deleted file mode 100644 index 3d02f3cbe3..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64.cpp +++ /dev/null @@ -1,54 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD -ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 0, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG -ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 5, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2 -ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 7, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN -ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX -ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX -ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN -ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 4); -ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 4, 1); 
-ADD_THREADWISE_INST_BY_ID(double, double, double, 2, 0, 1, 2, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX -ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 4); -ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 4, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 3, 0, 1, 2, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX -ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 4); -ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 4, 1); -ADD_THREADWISE_INST_BY_ID(double, double, double, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp new file mode 100644 index 0000000000..74b15eddba --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_add.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp new file mode 100644 index 0000000000..65762492f7 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_amax.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp new file mode 100644 index 0000000000..5e74295a0d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_avg.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp new file mode 100644 index 0000000000..6fdea6cc4d --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_max.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp new file mode 100644 index 0000000000..317d573dac --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_min.cpp @@ -0,0 +1,27 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp new file mode 100644 index 0000000000..29f95ebcc7 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_f64_f64_f64_norm2.cpp @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.cpp deleted file mode 100644 index fcf072a086..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8.cpp +++ /dev/null @@ -1,28 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. - -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD -ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG -ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1); -// clang-format on -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp new file mode 100644 index 0000000000..aa9f47cbc4 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_add.cpp @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp new file mode 100644 index 0000000000..54a9dd1ab7 --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i32_i8_avg.cpp @@ -0,0 +1,24 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.cpp deleted file mode 100644 index 85d7ce8b4c..0000000000 --- a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// SPDX-License-Identifier: MIT -// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
- -#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" - -namespace ck { -namespace tensor_operation { -namespace device { -namespace instance { - -// clang-format off -// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1); -ADD_THREADWISE_INST_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1); -// clang-format on - -} // namespace instance -} // namespace device -} // namespace tensor_operation - -} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp new file mode 100644 index 0000000000..4ef5717b5e --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_amax.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp new file mode 100644 index 0000000000..140a3c197b --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_max.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. + +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp new file mode 100644 index 0000000000..317b4ad39c --- /dev/null +++ b/library/src/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise_i8_i8_i8_min.cpp @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved. 
+ +#include "ck/utility/reduction_enums.hpp" +#include "ck/library/tensor_operation_instance/gpu/reduce/device_reduce_instance_threadwise.hpp" + +namespace ck { +namespace tensor_operation { +namespace device { +namespace instance { + +// clang-format off +// InDataType | AccDataType | OutDataType | Rank | NumReduceDim | ReduceOperation | InElementwiseOp | AccElementwiseOp | PropagateNan | UseIndex +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +template void add_device_reduce_instance_threadwise(std::vector>&); +// clang-format on + +} // namespace instance +} // namespace device +} // namespace tensor_operation + +} // namespace ck diff --git a/profiler/include/profile_reduce_impl.hpp b/profiler/include/profile_reduce_impl.hpp index 2d06ec22c5..981962bdc5 100644 --- a/profiler/include/profile_reduce_impl.hpp +++ b/profiler/include/profile_reduce_impl.hpp @@ -18,57 +18,61 @@ namespace tensor_operation { namespace device { namespace instance { -template +template struct ReduceDescription { - static constexpr int Rank_ = Rank; - static constexpr int NumReduceDim_ = NumReduceDim; - static constexpr int ReduceOpId_ = ReduceOpId; - static constexpr int PropagateNan_ = PropagateNan; - static constexpr int UseIndex_ = UseIndex; + static constexpr index_t Rank_ = Rank; + static constexpr index_t NumReduceDim_ = NumReduceDim; + static constexpr ReduceTensorOp ReduceOpId_ = ReduceOpId; + static constexpr bool PropagateNan_ = PropagateNan; + static constexpr bool UseIndex_ = UseIndex; }; using reduce_description_instances = - std::tuple, // for ADD - ReduceDescription<4, 4, 0, false, false>, - ReduceDescription<4, 1, 0, false, false>, - ReduceDescription<2, 1, 0, false, false>, + std::tuple, // for ADD + ReduceDescription<4, 4, ReduceTensorOp::ADD, false, false>, + ReduceDescription<4, 1, ReduceTensorOp::ADD, false, false>, + ReduceDescription<2, 1, ReduceTensorOp::ADD, false, false>, - ReduceDescription<4, 3, 5, false, false>, // for AVG - ReduceDescription<4, 4, 5, false, false>, - ReduceDescription<4, 1, 5, false, false>, - ReduceDescription<2, 1, 5, false, false>, + ReduceDescription<4, 3, ReduceTensorOp::AVG, false, false>, // for AVG + ReduceDescription<4, 4, ReduceTensorOp::AVG, false, false>, + ReduceDescription<4, 1, ReduceTensorOp::AVG, false, false>, + ReduceDescription<2, 1, ReduceTensorOp::AVG, false, false>, - ReduceDescription<4, 3, 7, false, false>, // for NORM2 - ReduceDescription<4, 4, 7, false, false>, - ReduceDescription<4, 1, 7, false, false>, - ReduceDescription<2, 1, 7, false, false>, + ReduceDescription<4, 3, ReduceTensorOp::NORM2, false, false>, // for NORM2 + ReduceDescription<4, 4, ReduceTensorOp::NORM2, false, false>, + ReduceDescription<4, 1, ReduceTensorOp::NORM2, false, false>, + ReduceDescription<2, 1, ReduceTensorOp::NORM2, false, false>, - ReduceDescription<4, 3, 2, false, false>, // for MIN - ReduceDescription<4, 4, 2, false, false>, - ReduceDescription<4, 1, 2, false, false>, - ReduceDescription<2, 1, 2, false, false>, - ReduceDescription<4, 3, 3, false, false>, // for MAX - ReduceDescription<4, 4, 3, false, false>, - 
ReduceDescription<4, 1, 3, false, false>, - ReduceDescription<2, 1, 3, false, false>, - ReduceDescription<4, 3, 4, false, false>, // for AMAX - ReduceDescription<4, 4, 4, false, false>, - ReduceDescription<4, 1, 4, false, false>, - ReduceDescription<2, 1, 4, false, false>, + ReduceDescription<4, 3, ReduceTensorOp::MIN, false, false>, // for MIN + ReduceDescription<4, 4, ReduceTensorOp::MIN, false, false>, + ReduceDescription<4, 1, ReduceTensorOp::MIN, false, false>, + ReduceDescription<2, 1, ReduceTensorOp::MIN, false, false>, + ReduceDescription<4, 3, ReduceTensorOp::MAX, false, false>, // for MAX + ReduceDescription<4, 4, ReduceTensorOp::MAX, false, false>, + ReduceDescription<4, 1, ReduceTensorOp::MAX, false, false>, + ReduceDescription<2, 1, ReduceTensorOp::MAX, false, false>, + ReduceDescription<4, 3, ReduceTensorOp::AMAX, false, false>, // for AMAX + ReduceDescription<4, 4, ReduceTensorOp::AMAX, false, false>, + ReduceDescription<4, 1, ReduceTensorOp::AMAX, false, false>, + ReduceDescription<2, 1, ReduceTensorOp::AMAX, false, false>, - ReduceDescription<4, 3, 2, false, true>, // for MIN - ReduceDescription<4, 4, 2, false, true>, - ReduceDescription<4, 1, 2, false, true>, - ReduceDescription<2, 1, 2, false, true>, - ReduceDescription<4, 3, 3, false, true>, // for MAX - ReduceDescription<4, 4, 3, false, true>, - ReduceDescription<4, 1, 3, false, true>, - ReduceDescription<2, 1, 3, false, true>, - ReduceDescription<4, 3, 4, false, true>, // for AMAX - ReduceDescription<4, 4, 4, false, true>, - ReduceDescription<4, 1, 4, false, true>, - ReduceDescription<2, 1, 4, false, true>>; + ReduceDescription<4, 3, ReduceTensorOp::MIN, false, true>, // for MIN + ReduceDescription<4, 4, ReduceTensorOp::MIN, false, true>, + ReduceDescription<4, 1, ReduceTensorOp::MIN, false, true>, + ReduceDescription<2, 1, ReduceTensorOp::MIN, false, true>, + ReduceDescription<4, 3, ReduceTensorOp::MAX, false, true>, // for MAX + ReduceDescription<4, 4, ReduceTensorOp::MAX, false, true>, + ReduceDescription<4, 1, ReduceTensorOp::MAX, false, true>, + ReduceDescription<2, 1, ReduceTensorOp::MAX, false, true>, + ReduceDescription<4, 3, ReduceTensorOp::AMAX, false, true>, // for AMAX + ReduceDescription<4, 4, ReduceTensorOp::AMAX, false, true>, + ReduceDescription<4, 1, ReduceTensorOp::AMAX, false, true>, + ReduceDescription<2, 1, ReduceTensorOp::AMAX, false, true>>; template bool description_match(const DescriptionType& description, @@ -78,9 +82,8 @@ bool description_match(const DescriptionType& description, bool PropagateNan, bool UseIndex) { - if(description.Rank_ != Rank || description.ReduceOpId_ != static_cast(ReduceOpId) || - description.PropagateNan_ != static_cast(PropagateNan) || - description.UseIndex_ != static_cast(UseIndex)) + if(description.Rank_ != Rank || description.ReduceOpId_ != ReduceOpId || + description.PropagateNan_ != PropagateNan || description.UseIndex_ != UseIndex) return (false); if(DescriptionType::NumReduceDim_ != reduceDims.size()) @@ -99,11 +102,10 @@ bool description_match(const DescriptionType& description, namespace ck { namespace profiler { -template -static inline std::vector get_invariant_dims(const std::vector& reduceDims) +template +static inline std::array +get_invariant_dims(const std::array& reduceDims) { - assert(NumReduceDim == reduceDims.size()); - int reduceFlag = 0; // flag the bits for the reduceDims @@ -112,13 +114,15 @@ static inline std::vector get_invariant_dims(const std::vector& reduce reduceFlag |= 1 << reduceDims[i]; }; - std::vector invariantDims; + std::array 
invariantDims; // collect invariant dimensions + int dim = 0; for(int i = 0; i < Rank; i++) if((reduceFlag & (1 << i)) == 0) { - invariantDims.push_back(i); + invariantDims[dim] = i; + dim++; }; return invariantDims; @@ -137,7 +141,7 @@ bool profile_reduce_impl_impl(bool do_verification, bool do_dumpout, bool time_kernel, const std::vector& inLengths, - const std::vector& reduceDims, + const std::array& reduceDims, float alpha, float beta) { @@ -145,6 +149,8 @@ bool profile_reduce_impl_impl(bool do_verification, using namespace ck::tensor_operation::device::instance; using ck::host_common::dumpBufferToFile; + constexpr index_t NumOutDim = (Rank - NumReduceDim == 0) ? 1 : Rank - NumReduceDim; + constexpr bool op_support_indices = (ReduceOpId == ReduceTensorOp::MIN || ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX); @@ -279,28 +285,32 @@ bool profile_reduce_impl_impl(bool do_verification, reduce_unary_operator::GetElementwiseOperator( static_cast(reduce_total_length)); - using DeviceReduceInstPtr0 = - DeviceReducePtr; + using DeviceReduceInstPtr = + DeviceReducePtr; - std::vector reduce0_ptrs; + std::vector reduce_ptrs; add_device_reduce_instance_threadwise(reduce0_ptrs); + UseIndex>(reduce_ptrs); add_device_reduce_instance_blockwise(reduce0_ptrs); + UseIndex>(reduce_ptrs); if constexpr(use_atomic_add) { @@ -309,12 +319,14 @@ bool profile_reduce_impl_impl(bool do_verification, OutDataType, Rank, NumReduceDim, - ReduceOpId, + ReduceOperation, + InElementwiseOperation, + AccElementwiseOperation, PropagateNan, - UseIndex>(reduce0_ptrs); + UseIndex>(reduce_ptrs); } - if(reduce0_ptrs.empty()) + if(reduce_ptrs.empty()) { throw std::runtime_error("Wrong! No device REDUCE instance found"); }; @@ -342,22 +354,22 @@ bool profile_reduce_impl_impl(bool do_verification, acc_elementwise_op); }; - std::vector i_inLengths; - std::vector i_inStrides; - std::vector i_outLengths; - std::vector i_outStrides; + std::array arrInLengths; + std::array arrInStrides; + std::array arrOutLengths; + std::array arrOutStrides; - i_inLengths.assign(inLengths.begin(), inLengths.end()); - i_inStrides.assign(inStrides.begin(), inStrides.end()); - i_outLengths.assign(outLengths.begin(), outLengths.end()); - i_outStrides.assign(outStrides.begin(), outStrides.end()); + std::copy(inLengths.begin(), inLengths.end(), arrInLengths.begin()); + std::copy(inStrides.begin(), inStrides.end(), arrInStrides.begin()); + std::copy(outLengths.begin(), outLengths.end(), arrOutLengths.begin()); + std::copy(outStrides.begin(), outStrides.end(), arrOutStrides.begin()); - for(auto& reduce_ptr : reduce0_ptrs) + for(auto& reduce_ptr : reduce_ptrs) { - auto argument_ptr = reduce_ptr->MakeArgumentPointer(i_inLengths, - i_inStrides, - i_outLengths, - i_outStrides, + auto argument_ptr = reduce_ptr->MakeArgumentPointer(arrInLengths, + arrInStrides, + arrOutLengths, + arrOutStrides, reduceDims, alpha, beta, @@ -478,22 +490,25 @@ bool profile_reduce_impl(bool do_verification, descType{}, inLengths.size(), reduceDims, ReduceOpId, PropagateNan, UseIndex)) return; - pass = pass && - profile_reduce_impl_impl(descType::ReduceOpId_), - static_cast(descType::PropagateNan_), - static_cast(descType::UseIndex_)>(do_verification, - init_method, - do_dumpout, - time_kernel, - inLengths, - reduceDims, - alpha, - beta); + std::array arrReduceDims; + + std::copy(reduceDims.begin(), reduceDims.end(), arrReduceDims.begin()); + + pass = pass && profile_reduce_impl_impl(descType::ReduceOpId_), + descType::PropagateNan_, + 
descType::UseIndex_>(do_verification, + init_method, + do_dumpout, + time_kernel, + inLengths, + arrReduceDims, + alpha, + beta); matched = true; });
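
The profiler changes above switch the dimension bookkeeping from std::vector to fixed-size std::array: the reduce dimensions now arrive as std::array<int, NumReduceDim>, and get_invariant_dims() derives the non-reduced dimensions with a bit-flag scan, which is also why the old assert on reduceDims.size() could be dropped. The standalone sketch below mirrors that pattern outside the library so it can be compiled and checked in isolation; it is illustrative only, and the names invariant_dims and the driver in main() are not part of the patch or of composable_kernel.

// Minimal sketch of the std::array-based invariant-dimension computation
// introduced by this patch (standalone, not CK code).
#include <array>
#include <cassert>
#include <cstdio>

template <int Rank, int NumReduceDim>
std::array<int, Rank - NumReduceDim>
invariant_dims(const std::array<int, NumReduceDim>& reduceDims)
{
    int reduceFlag = 0;

    // flag the bit of every reduced dimension
    for(int i = 0; i < NumReduceDim; i++)
        reduceFlag |= 1 << reduceDims[i];

    std::array<int, Rank - NumReduceDim> invariantDims{};

    // collect the dimensions whose bit is not set
    int dim = 0;
    for(int i = 0; i < Rank; i++)
        if((reduceFlag & (1 << i)) == 0)
            invariantDims[dim++] = i;

    return invariantDims;
}

int main()
{
    // rank-4 tensor, reducing dimensions 1 and 3 -> invariant dimensions 0 and 2
    std::array<int, 2> reduceDims{1, 3};
    auto inv = invariant_dims<4, 2>(reduceDims);

    assert(inv[0] == 0 && inv[1] == 2);
    std::printf("invariant dims: %d %d\n", inv[0], inv[1]);
    return 0;
}

Because the element count is now a template parameter rather than a runtime vector size, mismatches between NumReduceDim and the supplied dimensions become compile-time errors in the caller instead of runtime assertions inside the reduction utilities.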